src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL$
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <sys/stat.h>
  21 #include <sys/time.h>
  22 #include <sys/wait.h>
  23 #include <unistd.h>
  24
  25 #include "access/clog.h"
  26 #include "access/multixact.h"
  27 #include "access/subtrans.h"
  28 #include "access/transam.h"
  29 #include "access/tuptoaster.h"
  30 #include "access/twophase.h"
  31 #include "access/xact.h"
  32 #include "access/xlog_internal.h"
  33 #include "access/xlogutils.h"
  34 #include "catalog/catversion.h"
  35 #include "catalog/pg_control.h"
  36 #include "catalog/pg_type.h"
  37 #include "funcapi.h"
  38 #include "miscadmin.h"
  39 #include "pgstat.h"
  40 #include "postmaster/bgwriter.h"
  41 #include "storage/bufmgr.h"
  42 #include "storage/fd.h"
  43 #include "storage/ipc.h"
  44 #include "storage/pmsignal.h"
  45 #include "storage/procarray.h"
  46 #include "storage/smgr.h"
  47 #include "storage/spin.h"
  48 #include "utils/builtins.h"
  49 #include "utils/guc.h"
  50 #include "utils/ps_status.h"
  51
  52
  53 /* File path names (all relative to $PGDATA) */
  54 #define BACKUP_LABEL_FILE               "backup_label"
  55 #define BACKUP_LABEL_OLD                "backup_label.old"
  56 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  57 #define RECOVERY_COMMAND_DONE   "recovery.done"
  58
  59
   60 /*
       * User-settable parameters.
       *
       * The initializers below are only the compiled-in starting values.  The
       * code that applies the user's settings is not in this part of the file
       * (presumably the GUC machinery declared in utils/guc.h -- confirm).
       */
   61 int                     CheckPointSegments = 3; /* also determines XLOGfileslop */
   62 int                     XLOGbuffers = 8;
   63 int                     XLogArchiveTimeout = 0;
   64 bool            XLogArchiveMode = false;
   65 char       *XLogArchiveCommand = NULL;
   66 bool            fullPageWrites = true;  /* read by XLogInsert when deciding on page backups */
   67 bool            log_checkpoints = false;
   68 int             sync_method = DEFAULT_SYNC_METHOD;
   69
   70 #ifdef WAL_DEBUG
   71 bool            XLOG_DEBUG = false;     /* defined only in WAL_DEBUG builds */
   72 #endif
  73
   74 /*
   75  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
   76  * When we are done with an old XLOG segment file, we will recycle it as a
   77  * future XLOG segment as long as there aren't already XLOGfileslop future
   78  * segments; else we'll delete it.  This could be made a separate GUC
   79  * variable, but at present I think it's sufficient to hardwire it as
   80  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
   81  * no more than 2*CheckPointSegments log segments, and we want to recycle all
   82  * of them; the +1 allows boundary cases to happen without wasting a
   83  * delete/create-segment cycle.
       *
       * Because this is a macro over the CheckPointSegments variable, the limit
       * is recomputed at every use and so follows the variable's current value.
   84  */
   85 #define XLOGfileslop    (2*CheckPointSegments + 1)
  86
   87 /*
   88  * GUC support
       *
       * sync_method_options pairs each textual sync-method name with its
       * SYNC_METHOD_* code.  Only "fsync" is listed unconditionally; every other
       * entry is compiled in only when the macro in its surrounding #ifdef is
       * defined.  The array ends with a sentinel entry whose name is NULL.
       *
       * The third field is false in every entry; its meaning is defined by
       * struct config_enum_entry, which is declared outside this file.
       * NOTE(review): presumably this table is consumed by the GUC code for
       * the sync_method variable -- confirm against the GUC tables.
   89  */
   90 const struct config_enum_entry sync_method_options[] = {
   91         {"fsync", SYNC_METHOD_FSYNC, false},
   92 #ifdef HAVE_FSYNC_WRITETHROUGH
   93         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
   94 #endif
   95 #ifdef HAVE_FDATASYNC
   96         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
   97 #endif
   98 #ifdef OPEN_SYNC_FLAG
   99         {"open_sync", SYNC_METHOD_OPEN, false},
  100 #endif
  101 #ifdef OPEN_DATASYNC_FLAG
  102         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
  103 #endif
  104         {NULL, 0, false}
  105 };
 106
 107 /*
 108  * Statistics for current checkpoint are collected in this global struct.
 109  * Because only the background writer or a stand-alone backend can perform
 110  * checkpoints, this will be unused in normal backends.
 111  */
 112 CheckpointStatsData CheckpointStats;
 113
 114 /*
 115  * ThisTimeLineID will be same in all backends --- it identifies current
 116  * WAL timeline for the database system.
 117  */
 118 TimeLineID      ThisTimeLineID = 0;
 119
 120 /* Are we doing recovery from XLOG? */
 121 bool            InRecovery = false;
 122
 123 /* Are we recovering using offline XLOG archives? */
 124 static bool InArchiveRecovery = false;
 125
 126 /* Was the last xlog file restored from archive, or local? */
 127 static bool restoredFromArchive = false;
 128
 129 /* options taken from recovery.conf */
 130 static char *recoveryRestoreCommand = NULL;
 131 static bool recoveryTarget = false;
 132 static bool recoveryTargetExact = false;
 133 static bool recoveryTargetInclusive = true;
 134 static bool recoveryLogRestartpoints = false;
 135 static TransactionId recoveryTargetXid;
 136 static TimestampTz recoveryTargetTime;
 137 static TimestampTz recoveryLastXTime = 0;
 138
 139 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 140 static TransactionId recoveryStopXid;
 141 static TimestampTz recoveryStopTime;
 142 static bool recoveryStopAfter;
 143
 144 /*
 145  * During normal operation, the only timeline we care about is ThisTimeLineID.
 146  * During recovery, however, things are more complicated.  To simplify life
 147  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 148  * scan through the WAL history (that is, it is the line that was active when
 149  * the currently-scanned WAL record was generated).  We also need these
 150  * timeline values:
 151  *
 152  * recoveryTargetTLI: the desired timeline that we want to end in.
 153  *
 154  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 155  * its known parents, newest first (so recoveryTargetTLI is always the
 156  * first list member).  Only these TLIs are expected to be seen in the WAL
 157  * segments we read, and indeed only these TLIs will be considered as
 158  * candidate WAL files to open at all.
 159  *
 160  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 161  * (This is not necessarily the same as ThisTimeLineID, because we could
 162  * be scanning data that was copied from an ancestor timeline when the current
 163  * file was created.)  During a sequential scan we do not allow this value
 164  * to decrease.
 165  */
 166 static TimeLineID recoveryTargetTLI;
 167 static List *expectedTLIs;
 168 static TimeLineID curFileTLI;
 169
 170 /*
 171  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 172  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 173  * end+1 of the last record, and is reset when we end a top-level transaction,
 174  * or start a new one; so it can be used to tell if the current transaction has
 175  * created any XLOG records.
 176  */
 177 static XLogRecPtr ProcLastRecPtr = {0, 0};
 178
 179 XLogRecPtr      XactLastRecEnd = {0, 0};
 180
 181 /*
 182  * RedoRecPtr is this backend's local copy of the REDO record pointer
 183  * (which is almost but not quite the same as a pointer to the most recent
 184  * CHECKPOINT record).  We update this from the shared-memory copy,
 185  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 186  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 187  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 188  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 189  * InitXLOGAccess.
 190  */
 191 static XLogRecPtr RedoRecPtr;
 192
 193 /*----------
 194  * Shared-memory data structures for XLOG control
 195  *
 196  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 197  * the log up to (all records before that point must be written or fsynced).
 198  * LogwrtResult indicates the byte positions we have already written/fsynced.
 199  * These structs are identical but are declared separately to indicate their
 200  * slightly different functions.
 201  *
 202  * We do a lot of pushups to minimize the amount of access to lockable
 203  * shared memory values.  There are actually three shared-memory copies of
 204  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 205  *              XLogCtl->LogwrtResult is protected by info_lck
 206  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 207  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 208  * One must hold the associated lock to read or write any of these, but
 209  * of course no lock is needed to read/write the unshared LogwrtResult.
 210  *
 211  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 212  * right", since both are updated by a write or flush operation before
 213  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 214  * is that it can be examined/modified by code that already holds WALWriteLock
 215  * without needing to grab info_lck as well.
 216  *
  217  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
  218  * but is updated when convenient.  Again, it exists for the convenience of
  219  * code that is already holding WALInsertLock but not the other locks.
 220  *
 221  * The unshared LogwrtResult may lag behind any or all of these, and again
 222  * is updated when convenient.
 223  *
 224  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 225  * (protected by info_lck), but we don't need to cache any copies of it.
 226  *
 227  * Note that this all works because the request and result positions can only
 228  * advance forward, never back up, and so we can easily determine which of two
 229  * values is "more up to date".
 230  *
 231  * info_lck is only held long enough to read/update the protected variables,
 232  * so it's a plain spinlock.  The other locks are held longer (potentially
 233  * over I/O operations), so we use LWLocks for them.  These locks are:
 234  *
 235  * WALInsertLock: must be held to insert a record into the WAL buffers.
 236  *
 237  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 238  * XLogFlush).
 239  *
 240  * ControlFileLock: must be held to read/update control file or create
 241  * new log file.
 242  *
 243  * CheckpointLock: must be held to do a checkpoint (ensures only one
 244  * checkpointer at a time; currently, with all checkpoints done by the
 245  * bgwriter, this is just pro forma).
 246  *
 247  *----------
 248  */
  249
      /*
       * XLogwrtRqst: byte positions up to which the log needs to be written
       * and/or fsynced.  The shared instance is XLogCtl->LogwrtRqst, which is
       * protected by info_lck; XLogWrite also receives one by value as its
       * WriteRqst argument.
       */
  250 typedef struct XLogwrtRqst
  251 {
  252         XLogRecPtr      Write;                  /* last byte + 1 to write out */
  253         XLogRecPtr      Flush;                  /* last byte + 1 to flush */
  254 } XLogwrtRqst;
  255
      /*
       * XLogwrtResult: byte positions up to which the log has already been
       * written out and flushed.  It has the same layout as XLogwrtRqst but is
       * declared separately because it plays a different role.  Shared copies
       * live in XLogCtlData, XLogCtlInsert and XLogCtlWrite, and each backend
       * keeps a private, possibly stale copy in the static LogwrtResult.
       */
  256 typedef struct XLogwrtResult
  257 {
  258         XLogRecPtr      Write;                  /* last byte + 1 written out */
  259         XLogRecPtr      Flush;                  /* last byte + 1 flushed */
  260 } XLogwrtResult;
 261
  262 /*
  263  * Shared state data for XLogInsert.
       *
       * This struct is the Insert member of XLogCtlData, where it is listed as
       * protected by WALInsertLock.  Notes on individual fields:
       *
       * LogwrtResult may lag behind the other shared copies of LogwrtResult; it
       * exists for code that holds WALInsertLock but not the other locks.
       *
       * currpage and currpos together locate the insertion point within the
       * current page buffer; INSERT_FREESPACE derives the remaining room from
       * them.
       *
       * RedoRecPtr is the value from which each backend refreshes its private
       * RedoRecPtr copy.
       *
       * forcePageWrites is first read by XLogInsert before it takes
       * WALInsertLock and is then rechecked once the lock is held.
  264  */
  265 typedef struct XLogCtlInsert
  266 {
  267         XLogwrtResult LogwrtResult; /* a recent value of LogwrtResult */
  268         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
  269         int                     curridx;                /* current block index in cache */
  270         XLogPageHeader currpage;        /* points to header of block in cache */
  271         char       *currpos;            /* current insertion point in cache */
  272         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions */
  273         bool            forcePageWrites;        /* forcing full-page writes for PITR? */
  274 } XLogCtlInsert;
 275
  276 /*
  277  * Shared state data for XLogWrite/XLogFlush.
       *
       * This struct is the Write member of XLogCtlData, where it is listed as
       * protected by WALWriteLock.  Its LogwrtResult is one of the two "always
       * right" shared copies: it is updated by a write or flush operation
       * before WALWriteLock is released, so code already holding that lock can
       * use it without also taking info_lck.
  278  */
  279 typedef struct XLogCtlWrite
  280 {
  281         XLogwrtResult LogwrtResult; /* current value of LogwrtResult */
  282         int                     curridx;                /* cache index of next block to write */
  283         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
  284 } XLogCtlWrite;
 285
  286 /*
  287  * Total shared-memory state for XLOG.
       *
       * The single instance is reached through the file-level XLogCtl pointer.
       * Members are grouped by the lock that guards them: WALInsertLock and
       * WALWriteLock are LWLocks, while info_lck is the spinlock stored at the
       * end of this struct.
  288  */
  289 typedef struct XLogCtlData
  290 {
  291         /* Protected by WALInsertLock: */
  292         XLogCtlInsert Insert;
  293
  294         /* Protected by info_lck: */
  295         XLogwrtRqst LogwrtRqst;
  296         XLogwrtResult LogwrtResult;
  297         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
  298         TransactionId ckptXid;
  299         XLogRecPtr      asyncCommitLSN; /* LSN of newest async commit */
  300
  301         /* Protected by WALWriteLock: */
  302         XLogCtlWrite Write;
  303
  304         /*
  305          * These values do not change after startup, although the pointed-to pages
  306          * and xlblocks values certainly do.  Permission to read/write the pages
  307          * and xlblocks values depends on WALInsertLock and WALWriteLock.
  308          */
  309         char       *pages;                      /* buffers for unwritten XLOG pages */
  310         XLogRecPtr *xlblocks;           /* per buffer: position of the page's first
                                               * byte plus XLOG_BLCKSZ */
  311         int                     XLogCacheBlck;  /* highest allocated xlog buffer index;
                                                       * wraparound point for PrevBufIdx and
                                                       * NextBufIdx */
  312         TimeLineID      ThisTimeLineID; /* NOTE(review): presumably the shared copy
                                               * of the global ThisTimeLineID -- confirm */
  313
  314         slock_t         info_lck;               /* locks shared variables shown above */
  315 } XLogCtlData;
 316
 317 static XLogCtlData *XLogCtl = NULL;
 318
 319 /*
 320  * We maintain an image of pg_control in shared memory.
 321  */
 322 static ControlFileData *ControlFile = NULL;
 323
  324 /*
  325  * Macros for managing XLogInsert state.  Callers usually already hold a
  326  * pointer to XLogCtl->Insert and/or a copy of XLogCtl->Insert.curridx, so
  327  * these are passed as parameters instead of being fetched via XLogCtl.
  328  */
 329
  330 /*
       * Free space remaining in the current xlog page buffer.
       *
       * Insert is a pointer to an XLogCtlInsert.  The result is XLOG_BLCKSZ
       * minus the distance from the start of the current page (currpage) to the
       * insertion point (currpos).  The argument is evaluated twice, so it must
       * not have side effects.
       */
  331 #define INSERT_FREESPACE(Insert)  \
  332         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 333
  334 /*
       * Construct XLogRecPtr value for current insertion point.
       *
       * recptr must be an XLogRecPtr lvalue; both of its fields are assigned.
       * xlogid is copied from XLogCtl->xlblocks[curridx], and xrecoff is that
       * entry's xrecoff minus the free space still left in the page (see
       * INSERT_FREESPACE).  The macro reads the file-level XLogCtl pointer
       * directly, and it is a comma expression whose value is the xrecoff
       * assignment.
       */
  335 #define INSERT_RECPTR(recptr,Insert,curridx)  \
  336         ( \
  337           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
  338           (recptr).xrecoff = \
  339                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
  340         )
  341
      /*
       * Step an xlog buffer index backward (PrevBufIdx) or forward (NextBufIdx)
       * with wraparound: valid indexes run from 0 through
       * XLogCtl->XLogCacheBlck, so stepping back from 0 yields XLogCacheBlck and
       * stepping forward from XLogCacheBlck yields 0.  idx is evaluated more
       * than once, so it must not have side effects.
       */
  342 #define PrevBufIdx(idx)         \
  343                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
  344
  345 #define NextBufIdx(idx)         \
  346                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 347
 348 /*
 349  * Private, possibly out-of-date copy of shared LogwrtResult.
 350  * See discussion above.
 351  */
 352 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 353
 354 /*
 355  * openLogFile is -1 or a kernel FD for an open log file segment.
 356  * When it's open, openLogOff is the current seek offset in the file.
 357  * openLogId/openLogSeg identify the segment.  These variables are only
 358  * used to write the XLOG, and so will normally refer to the active segment.
 359  */
 360 static int      openLogFile = -1;
 361 static uint32 openLogId = 0;
 362 static uint32 openLogSeg = 0;
 363 static uint32 openLogOff = 0;
 364
 365 /*
 366  * These variables are used similarly to the ones above, but for reading
 367  * the XLOG.  Note, however, that readOff generally represents the offset
 368  * of the page just read, not the seek position of the FD itself, which
 369  * will be just past that page.
 370  */
 371 static int      readFile = -1;
 372 static uint32 readId = 0;
 373 static uint32 readSeg = 0;
 374 static uint32 readOff = 0;
 375
 376 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 377 static char *readBuf = NULL;
 378
 379 /* Buffer for current ReadRecord result (expandable) */
 380 static char *readRecordBuf = NULL;
 381 static uint32 readRecordBufSize = 0;
 382
 383 /* State information for XLOG reading */
 384 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 385 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 386 static XLogRecord *nextRecord = NULL;
 387 static TimeLineID lastPageTLI = 0;
 388
 389 static bool InRedo = false;
 390
  391
      /*
       * Forward declarations of routines private to this file.  Every one is
       * declared static, so none is visible to other translation units.
       * xlog_outrec is declared only when WAL_DEBUG is defined.
       */
  392 static void XLogArchiveNotify(const char *xlog);
  393 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
  394 static bool XLogArchiveCheckDone(const char *xlog);
  395 static bool XLogArchiveIsBusy(const char *xlog);
  396 static void XLogArchiveCleanup(const char *xlog);
  397 static void readRecoveryCommandFile(void);
  398 static void exitArchiveRecovery(TimeLineID endTLI,
  399                                         uint32 endLogId, uint32 endLogSeg);
  400 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
  401 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  402
  403 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  404                                 XLogRecPtr *lsn, BkpBlock *bkpb);
  405 static bool AdvanceXLInsertBuffer(bool new_segment);
  406 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
  407 static int XLogFileInit(uint32 log, uint32 seg,
  408                          bool *use_existent, bool use_lock);
  409 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
  410                                            bool find_free, int *max_advance,
  411                                            bool use_lock);
  412 static int      XLogFileOpen(uint32 log, uint32 seg);
  413 static int      XLogFileRead(uint32 log, uint32 seg, int emode);
  414 static void XLogFileClose(void);
  415 static bool RestoreArchivedFile(char *path, const char *xlogfname,
  416                                         const char *recovername, off_t expectedSize);
  417 static void PreallocXlogFiles(XLogRecPtr endptr);
  418 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
  419 static void CleanupBackupHistory(void);
  420 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
  421 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
  422 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
  423 static List *readTimeLineHistory(TimeLineID targetTLI);
  424 static bool existsTimeLineHistory(TimeLineID probeTLI);
  425 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
  426 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
  427                                          TimeLineID endTLI,
  428                                          uint32 endLogId, uint32 endLogSeg);
  429 static void WriteControlFile(void);
  430 static void ReadControlFile(void);
  431 static char *str_time(pg_time_t tnow);
  432 #ifdef WAL_DEBUG
  433 static void xlog_outrec(StringInfo buf, XLogRecord *record);
  434 #endif
  435 static void issue_xlog_fsync(void);
  436 static void pg_start_backup_callback(int code, Datum arg);
  437 static bool read_backup_label(XLogRecPtr *checkPointLoc,
  438                                   XLogRecPtr *minRecoveryLoc);
  439 static void rm_redo_error_callback(void *arg);
  440 static int get_sync_bit(int method);
 441
 442
 443 /*
 444  * Insert an XLOG record having the specified RMID and info bytes,
 445  * with the body of the record being the data chunk(s) described by
 446  * the rdata chain (see xlog.h for notes about rdata).
 447  *
 448  * Returns XLOG pointer to end of record (beginning of next record).
 449  * This can be used as LSN for data pages affected by the logged action.
 450  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 451  * before the data page can be written out.  This implements the basic
 452  * WAL rule "write the log before the data".)
 453  *
 454  * NB: this routine feels free to scribble on the XLogRecData structs,
 455  * though not on the data they reference.  This is OK since the XLogRecData
 456  * structs are always just temporaries in the calling code.
 457  */
 458 XLogRecPtr
 459 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 460 {
 461         XLogCtlInsert *Insert = &XLogCtl->Insert;
 462         XLogRecord *record;
 463         XLogContRecord *contrecord;
 464         XLogRecPtr      RecPtr;
 465         XLogRecPtr      WriteRqst;
 466         uint32          freespace;
 467         int                     curridx;
 468         XLogRecData *rdt;
 469         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 470         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 471         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 472         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 473         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 474         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 475         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 476         pg_crc32        rdata_crc;
 477         uint32          len,
 478                                 write_len;
 479         unsigned        i;
 480         bool            updrqst;
 481         bool            doPageWrites;
 482         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 483
 484         /* info's high bits are reserved for use by me */
 485         if (info & XLR_INFO_MASK)
 486                 elog(PANIC, "invalid xlog info mask %02X", info);
 487
 488         /*
 489          * In bootstrap mode, we don't actually log anything but XLOG resources;
 490          * return a phony record pointer.
 491          */
 492         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 493         {
 494                 RecPtr.xlogid = 0;
 495                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 496                 return RecPtr;
 497         }
 498
 499         /*
 500          * Here we scan the rdata chain, determine which buffers must be backed
 501          * up, and compute the CRC values for the data.  Note that the record
 502          * header isn't added into the CRC initially since we don't know the final
 503          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 504          * the whole record in the order "rdata, then backup blocks, then record
 505          * header".
 506          *
 507          * We may have to loop back to here if a race condition is detected below.
 508          * We could prevent the race by doing all this work while holding the
 509          * insert lock, but it seems better to avoid doing CRC calculations while
 510          * holding the lock.  This means we have to be careful about modifying the
 511          * rdata chain until we know we aren't going to loop back again.  The only
 512          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 513          * chain items we have decided we will have to back up the whole buffer
 514          * for.  This is OK because we will certainly decide the same thing again
 515          * for those items if we do it over; doing it here saves an extra pass
 516          * over the chain later.
 517          */
 518 begin:;
 519         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 520         {
 521                 dtbuf[i] = InvalidBuffer;
 522                 dtbuf_bkp[i] = false;
 523         }
 524
 525         /*
 526          * Decide if we need to do full-page writes in this XLOG record: true if
 527          * full_page_writes is on or we have a PITR request for it.  Since we
 528          * don't yet have the insert lock, forcePageWrites could change under us,
 529          * but we'll recheck it once we have the lock.
 530          */
 531         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 532
 533         INIT_CRC32(rdata_crc);
 534         len = 0;
 535         for (rdt = rdata;;)
 536         {
 537                 if (rdt->buffer == InvalidBuffer)
 538                 {
 539                         /* Simple data, just include it */
 540                         len += rdt->len;
 541                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 542                 }
 543                 else
 544                 {
 545                         /* Find info for buffer */
 546                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 547                         {
 548                                 if (rdt->buffer == dtbuf[i])
 549                                 {
 550                                         /* Buffer already referenced by earlier chain item */
 551                                         if (dtbuf_bkp[i])
 552                                                 rdt->data = NULL;
 553                                         else if (rdt->data)
 554                                         {
 555                                                 len += rdt->len;
 556                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 557                                         }
 558                                         break;
 559                                 }
 560                                 if (dtbuf[i] == InvalidBuffer)
 561                                 {
 562                                         /* OK, put it in this slot */
 563                                         dtbuf[i] = rdt->buffer;
 564                                         if (XLogCheckBuffer(rdt, doPageWrites,
 565                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 566                                         {
 567                                                 dtbuf_bkp[i] = true;
 568                                                 rdt->data = NULL;
 569                                         }
 570                                         else if (rdt->data)
 571                                         {
 572                                                 len += rdt->len;
 573                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 574                                         }
 575                                         break;
 576                                 }
 577                         }
 578                         if (i >= XLR_MAX_BKP_BLOCKS)
 579                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 580                                          XLR_MAX_BKP_BLOCKS);
 581                 }
 582                 /* Break out of loop when rdt points to last chain item */
 583                 if (rdt->next == NULL)
 584                         break;
 585                 rdt = rdt->next;
 586         }
 587
 588         /*
 589          * Now add the backup block headers and data into the CRC
 590          */
 591         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 592         {
 593                 if (dtbuf_bkp[i])
 594                 {
 595                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 596                         char       *page;
 597
 598                         COMP_CRC32(rdata_crc,
 599                                            (char *) bkpb,
 600                                            sizeof(BkpBlock));
 601                         page = (char *) BufferGetBlock(dtbuf[i]);
 602                         if (bkpb->hole_length == 0)
 603                         {
 604                                 COMP_CRC32(rdata_crc,
 605                                                    page,
 606                                                    BLCKSZ);
 607                         }
 608                         else
 609                         {
 610                                 /* must skip the hole */
 611                                 COMP_CRC32(rdata_crc,
 612                                                    page,
 613                                                    bkpb->hole_offset);
 614                                 COMP_CRC32(rdata_crc,
 615                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 616                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 617                         }
 618                 }
 619         }
 620
 621         /*
 622          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 623          * error checking in ReadRecord.  This means that all callers of
 624          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 625          * make an exception for XLOG SWITCH records because we don't want them to
 626          * ever cross a segment boundary.
 627          */
 628         if (len == 0 && !isLogSwitch)
 629                 elog(PANIC, "invalid xlog record length %u", len);
 630
 631         START_CRIT_SECTION();
 632
 633         /* Now wait to get insert lock */
 634         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 635
 636         /*
 637          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 638          * back and recompute everything.  This can only happen just after a
 639          * checkpoint, so it's better to be slow in this case and fast otherwise.
 640          *
 641          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 642          * affect the contents of the XLOG record, so we'll update our local copy
 643          * but not force a recomputation.
 644          */
 645         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 646         {
 647                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 648                 RedoRecPtr = Insert->RedoRecPtr;
 649
 650                 if (doPageWrites)
 651                 {
 652                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 653                         {
 654                                 if (dtbuf[i] == InvalidBuffer)
 655                                         continue;
 656                                 if (dtbuf_bkp[i] == false &&
 657                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 658                                 {
 659                                         /*
 660                                          * Oops, this buffer now needs to be backed up, but we
 661                                          * didn't think so above.  Start over.
 662                                          */
 663                                         LWLockRelease(WALInsertLock);
 664                                         END_CRIT_SECTION();
 665                                         goto begin;
 666                                 }
 667                         }
 668                 }
 669         }
 670
 671         /*
 672          * Also check to see if forcePageWrites was just turned on; if we weren't
 673          * already doing full-page writes then go back and recompute. (If it was
 674          * just turned off, we could recompute the record without full pages, but
 675          * we choose not to bother.)
 676          */
 677         if (Insert->forcePageWrites && !doPageWrites)
 678         {
 679                 /* Oops, must redo it with full-page data */
 680                 LWLockRelease(WALInsertLock);
 681                 END_CRIT_SECTION();
 682                 goto begin;
 683         }
 684
 685         /*
 686          * Make additional rdata chain entries for the backup blocks, so that we
 687          * don't need to special-case them in the write loop.  Note that we have
 688          * now irrevocably changed the input rdata chain.  At the exit of this
 689          * loop, write_len includes the backup block data.
 690          *
 691          * Also set the appropriate info bits to show which buffers were backed
 692          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 693          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 694          */
 695         write_len = len;
 696         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 697         {
 698                 BkpBlock   *bkpb;
 699                 char       *page;
 700
 701                 if (!dtbuf_bkp[i])
 702                         continue;
 703
 704                 info |= XLR_SET_BKP_BLOCK(i);
 705
 706                 bkpb = &(dtbuf_xlg[i]);
 707                 page = (char *) BufferGetBlock(dtbuf[i]);
 708
 709                 rdt->next = &(dtbuf_rdt1[i]);
 710                 rdt = rdt->next;
 711
 712                 rdt->data = (char *) bkpb;
 713                 rdt->len = sizeof(BkpBlock);
 714                 write_len += sizeof(BkpBlock);
 715
 716                 rdt->next = &(dtbuf_rdt2[i]);
 717                 rdt = rdt->next;
 718
 719                 if (bkpb->hole_length == 0)
 720                 {
 721                         rdt->data = page;
 722                         rdt->len = BLCKSZ;
 723                         write_len += BLCKSZ;
 724                         rdt->next = NULL;
 725                 }
 726                 else
 727                 {
 728                         /* must skip the hole */
 729                         rdt->data = page;
 730                         rdt->len = bkpb->hole_offset;
 731                         write_len += bkpb->hole_offset;
 732
 733                         rdt->next = &(dtbuf_rdt3[i]);
 734                         rdt = rdt->next;
 735
 736                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 737                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 738                         write_len += rdt->len;
 739                         rdt->next = NULL;
 740                 }
 741         }
 742
 743         /*
 744          * If we backed up any full blocks and online backup is not in progress,
 745          * mark the backup blocks as removable.  This allows the WAL archiver to
 746          * know whether it is safe to compress archived WAL data by transforming
 747          * full-block records into the non-full-block format.
 748          *
 749          * Note: we could just set the flag whenever !forcePageWrites, but
 750          * defining it like this leaves the info bit free for some potential other
 751          * use in records without any backup blocks.
 752          */
 753         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 754                 info |= XLR_BKP_REMOVABLE;
 755
 756         /*
 757          * If there isn't enough space on the current XLOG page for a record
 758          * header, advance to the next page (leaving the unused space as zeroes).
 759          */
 760         updrqst = false;
 761         freespace = INSERT_FREESPACE(Insert);
 762         if (freespace < SizeOfXLogRecord)
 763         {
 764                 updrqst = AdvanceXLInsertBuffer(false);
 765                 freespace = INSERT_FREESPACE(Insert);
 766         }
 767
 768         /* Compute record's XLOG location */
 769         curridx = Insert->curridx;
 770         INSERT_RECPTR(RecPtr, Insert, curridx);
 771
 772         /*
 773          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 774          * segment, we need not insert it (and don't want to because we'd like
 775          * consecutive switch requests to be no-ops).  Instead, make sure
 776          * everything is written and flushed through the end of the prior segment,
 777          * and return the prior segment's end address.
 778          */
 779         if (isLogSwitch &&
 780                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 781         {
 782                 /* We can release insert lock immediately */
 783                 LWLockRelease(WALInsertLock);
 784
 785                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 786                 if (RecPtr.xrecoff == 0)
 787                 {
 788                         /* crossing a logid boundary */
 789                         RecPtr.xlogid -= 1;
 790                         RecPtr.xrecoff = XLogFileSize;
 791                 }
 792
 793                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 794                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 795                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 796                 {
 797                         XLogwrtRqst FlushRqst;
 798
 799                         FlushRqst.Write = RecPtr;
 800                         FlushRqst.Flush = RecPtr;
 801                         XLogWrite(FlushRqst, false, false);
 802                 }
 803                 LWLockRelease(WALWriteLock);
 804
 805                 END_CRIT_SECTION();
 806
 807                 return RecPtr;
 808         }
 809
 810         /* Insert record header */
 811
 812         record = (XLogRecord *) Insert->currpos;
 813         record->xl_prev = Insert->PrevRecord;
 814         record->xl_xid = GetCurrentTransactionIdIfAny();
 815         record->xl_tot_len = SizeOfXLogRecord + write_len;
 816         record->xl_len = len;           /* doesn't include backup blocks */
 817         record->xl_info = info;
 818         record->xl_rmid = rmid;
 819
 820         /* Now we can finish computing the record's CRC */
 821         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 822                            SizeOfXLogRecord - sizeof(pg_crc32));
 823         FIN_CRC32(rdata_crc);
 824         record->xl_crc = rdata_crc;
 825
 826 #ifdef WAL_DEBUG
 827         if (XLOG_DEBUG)
 828         {
 829                 StringInfoData buf;
 830
 831                 initStringInfo(&buf);
 832                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
 833                                                  RecPtr.xlogid, RecPtr.xrecoff);
 834                 xlog_outrec(&buf, record);
 835                 if (rdata->data != NULL)
 836                 {
 837                         appendStringInfo(&buf, " - ");
 838                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
 839                 }
 840                 elog(LOG, "%s", buf.data);
 841                 pfree(buf.data);
 842         }
 843 #endif
 844
 845         /* Record begin of record in appropriate places */
 846         ProcLastRecPtr = RecPtr;
 847         Insert->PrevRecord = RecPtr;
 848
 849         Insert->currpos += SizeOfXLogRecord;
 850         freespace -= SizeOfXLogRecord;
 851
 852         /*
 853          * Append the data, including backup blocks if any
 854          */
 855         while (write_len)
 856         {
 857                 while (rdata->data == NULL)
 858                         rdata = rdata->next;
 859
 860                 if (freespace > 0)
 861                 {
 862                         if (rdata->len > freespace)
 863                         {
 864                                 memcpy(Insert->currpos, rdata->data, freespace);
 865                                 rdata->data += freespace;
 866                                 rdata->len -= freespace;
 867                                 write_len -= freespace;
 868                         }
 869                         else
 870                         {
 871                                 memcpy(Insert->currpos, rdata->data, rdata->len);
 872                                 freespace -= rdata->len;
 873                                 write_len -= rdata->len;
 874                                 Insert->currpos += rdata->len;
 875                                 rdata = rdata->next;
 876                                 continue;
 877                         }
 878                 }
 879
 880                 /* Use next buffer */
 881                 updrqst = AdvanceXLInsertBuffer(false);
 882                 curridx = Insert->curridx;
 883                 /* Insert cont-record header */
 884                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
 885                 contrecord = (XLogContRecord *) Insert->currpos;
 886                 contrecord->xl_rem_len = write_len;
 887                 Insert->currpos += SizeOfXLogContRecord;
 888                 freespace = INSERT_FREESPACE(Insert);
 889         }
 890
 891         /* Ensure next record will be properly aligned */
 892         Insert->currpos = (char *) Insert->currpage +
 893                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
 894         freespace = INSERT_FREESPACE(Insert);
 895
 896         /*
 897          * The recptr I return is the beginning of the *next* record. This will be
 898          * stored as LSN for changed data pages...
 899          */
 900         INSERT_RECPTR(RecPtr, Insert, curridx);
 901
 902         /*
 903          * If the record is an XLOG_SWITCH, we must now write and flush all the
 904          * existing data, and then forcibly advance to the start of the next
 905          * segment.  It's not good to do this I/O while holding the insert lock,
 906          * but there seems too much risk of confusion if we try to release the
 907          * lock sooner.  Fortunately xlog switch needn't be a high-performance
 908          * operation anyway...
 909          */
 910         if (isLogSwitch)
 911         {
 912                 XLogCtlWrite *Write = &XLogCtl->Write;
 913                 XLogwrtRqst FlushRqst;
 914                 XLogRecPtr      OldSegEnd;
 915
 916                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 917
 918                 /*
 919                  * Flush through the end of the page containing XLOG_SWITCH, and
 920                  * perform end-of-segment actions (eg, notifying archiver).
 921                  */
 922                 WriteRqst = XLogCtl->xlblocks[curridx];
 923                 FlushRqst.Write = WriteRqst;
 924                 FlushRqst.Flush = WriteRqst;
 925                 XLogWrite(FlushRqst, false, true);
 926
 927                 /* Set up the next buffer as first page of next segment */
 928                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
 929                 (void) AdvanceXLInsertBuffer(true);
 930
 931                 /* There should be no unwritten data */
 932                 curridx = Insert->curridx;
 933                 Assert(curridx == Write->curridx);
 934
 935                 /* Compute end address of old segment */
 936                 OldSegEnd = XLogCtl->xlblocks[curridx];
 937                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
 938                 if (OldSegEnd.xrecoff == 0)
 939                 {
 940                         /* crossing a logid boundary */
 941                         OldSegEnd.xlogid -= 1;
 942                         OldSegEnd.xrecoff = XLogFileSize;
 943                 }
 944
 945                 /* Make it look like we've written and synced all of old segment */
 946                 LogwrtResult.Write = OldSegEnd;
 947                 LogwrtResult.Flush = OldSegEnd;
 948
 949                 /*
 950                  * Update shared-memory status --- this code should match XLogWrite
 951                  */
 952                 {
 953                         /* use volatile pointer to prevent code rearrangement */
 954                         volatile XLogCtlData *xlogctl = XLogCtl;
 955
 956                         SpinLockAcquire(&xlogctl->info_lck);
 957                         xlogctl->LogwrtResult = LogwrtResult;
 958                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
 959                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
 960                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
 961                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
 962                         SpinLockRelease(&xlogctl->info_lck);
 963                 }
 964
 965                 Write->LogwrtResult = LogwrtResult;
 966
 967                 LWLockRelease(WALWriteLock);
 968
 969                 updrqst = false;                /* done already */
 970         }
 971         else
 972         {
 973                 /* normal case, ie not xlog switch */
 974
 975                 /* Need to update shared LogwrtRqst if some block was filled up */
 976                 if (freespace < SizeOfXLogRecord)
 977                 {
 978                         /* curridx is filled and available for writing out */
 979                         updrqst = true;
 980                 }
 981                 else
 982                 {
 983                         /* if updrqst already set, write through end of previous buf */
 984                         curridx = PrevBufIdx(curridx);
 985                 }
 986                 WriteRqst = XLogCtl->xlblocks[curridx];
 987         }
 988
 989         LWLockRelease(WALInsertLock);
 990
 991         if (updrqst)
 992         {
 993                 /* use volatile pointer to prevent code rearrangement */
 994                 volatile XLogCtlData *xlogctl = XLogCtl;
 995
 996                 SpinLockAcquire(&xlogctl->info_lck);
 997                 /* advance global request to include new block(s) */
 998                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
 999                         xlogctl->LogwrtRqst.Write = WriteRqst;
1000                 /* update local result copy while I have the chance */
1001                 LogwrtResult = xlogctl->LogwrtResult;
1002                 SpinLockRelease(&xlogctl->info_lck);
1003         }
1004
1005         XactLastRecEnd = RecPtr;
1006
1007         END_CRIT_SECTION();
1008
1009         return RecPtr;
1010 }
1011
1012 /*
1013  * Determine whether the buffer referenced by an XLogRecData item has to
1014  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1015  * save the buffer's LSN at *lsn.
1016  */
1017 static bool
1018 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1019                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1020 {
1021         Page            page;
1022
1023         page = BufferGetPage(rdata->buffer);
1024
1025         /*
1026          * XXX We assume page LSN is first data on *every* page that can be passed
1027          * to XLogInsert, whether it otherwise has the standard page layout or
1028          * not.
1029          */
1030         *lsn = PageGetLSN(page);
1031
1032         if (doPageWrites &&
1033                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1034         {
1035                 /*
1036                  * The page needs to be backed up, so set up *bkpb
1037                  */
1038                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1039
1040                 if (rdata->buffer_std)
1041                 {
1042                         /* Assume we can omit data between pd_lower and pd_upper */
1043                         uint16          lower = ((PageHeader) page)->pd_lower;
1044                         uint16          upper = ((PageHeader) page)->pd_upper;
1045
1046                         if (lower >= SizeOfPageHeaderData &&
1047                                 upper > lower &&
1048                                 upper <= BLCKSZ)
1049                         {
1050                                 bkpb->hole_offset = lower;
1051                                 bkpb->hole_length = upper - lower;
1052                         }
1053                         else
1054                         {
1055                                 /* No "hole" to compress out */
1056                                 bkpb->hole_offset = 0;
1057                                 bkpb->hole_length = 0;
1058                         }
1059                 }
1060                 else
1061                 {
1062                         /* Not a standard page header, don't try to eliminate "hole" */
1063                         bkpb->hole_offset = 0;
1064                         bkpb->hole_length = 0;
1065                 }
1066
1067                 return true;                    /* buffer requires backup */
1068         }
1069
1070         return false;                           /* buffer does not need to be backed up */
1071 }
1072
1073 /*
1074  * XLogArchiveNotify
1075  *
1076  * Create an archive notification file
1077  *
1078  * The name of the notification file is the message that will be picked up
1079  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1080  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1081  * then when complete, rename it to 0000000100000001000000C6.done
1082  */
1083 static void
1084 XLogArchiveNotify(const char *xlog)
1085 {
1086         char            archiveStatusPath[MAXPGPATH];
1087         FILE       *fd;
1088
1089         /* insert an otherwise empty file called <XLOG>.ready */
1090         StatusFilePath(archiveStatusPath, xlog, ".ready");
1091         fd = AllocateFile(archiveStatusPath, "w");
1092         if (fd == NULL)
1093         {
1094                 ereport(LOG,
1095                                 (errcode_for_file_access(),
1096                                  errmsg("could not create archive status file \"%s\": %m",
1097                                                 archiveStatusPath)));
1098                 return;
1099         }
1100         if (FreeFile(fd))
1101         {
1102                 ereport(LOG,
1103                                 (errcode_for_file_access(),
1104                                  errmsg("could not write archive status file \"%s\": %m",
1105                                                 archiveStatusPath)));
1106                 return;
1107         }
1108
1109         /* Notify archiver that it's got something to do */
1110         if (IsUnderPostmaster)
1111                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1112 }
1113
1114 /*
1115  * Convenience routine to notify using log/seg representation of filename
1116  */
1117 static void
1118 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1119 {
1120         char            xlog[MAXFNAMELEN];
1121
1122         XLogFileName(xlog, ThisTimeLineID, log, seg);
1123         XLogArchiveNotify(xlog);
1124 }
1125
1126 /*
1127  * XLogArchiveCheckDone
1128  *
1129  * This is called when we are ready to delete or recycle an old XLOG segment
1130  * file or backup history file.  If it is okay to delete it then return true.
1131  * If it is not time to delete it, make sure a .ready file exists, and return
1132  * false.
1133  *
1134  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1135  * then return false; else create <XLOG>.ready and return false.
1136  *
1137  * The reason we do things this way is so that if the original attempt to
1138  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1139  */
1140 static bool
1141 XLogArchiveCheckDone(const char *xlog)
1142 {
1143         char            archiveStatusPath[MAXPGPATH];
1144         struct stat stat_buf;
1145
1146         /* Always deletable if archiving is off */
1147         if (!XLogArchivingActive())
1148                 return true;
1149
1150         /* First check for .done --- this means archiver is done with it */
1151         StatusFilePath(archiveStatusPath, xlog, ".done");
1152         if (stat(archiveStatusPath, &stat_buf) == 0)
1153                 return true;
1154
1155         /* check for .ready --- this means archiver is still busy with it */
1156         StatusFilePath(archiveStatusPath, xlog, ".ready");
1157         if (stat(archiveStatusPath, &stat_buf) == 0)
1158                 return false;
1159
1160         /* Race condition --- maybe archiver just finished, so recheck */
1161         StatusFilePath(archiveStatusPath, xlog, ".done");
1162         if (stat(archiveStatusPath, &stat_buf) == 0)
1163                 return true;
1164
1165         /* Retry creation of the .ready file */
1166         XLogArchiveNotify(xlog);
1167         return false;
1168 }
1169
1170 /*
1171  * XLogArchiveIsBusy
1172  *
1173  * Check to see if an XLOG segment file is still unarchived.
1174  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1175  * the first place we aren't chartered to recreate the .ready file, and
1176  * in the second place we should consider that if the file is already gone
1177  * then it's not busy.  (This check is needed to handle the race condition
1178  * that a checkpoint already deleted the no-longer-needed file.)
1179  */
1180 static bool
1181 XLogArchiveIsBusy(const char *xlog)
1182 {
1183         char            archiveStatusPath[MAXPGPATH];
1184         struct stat stat_buf;
1185
1186         /* First check for .done --- this means archiver is done with it */
1187         StatusFilePath(archiveStatusPath, xlog, ".done");
1188         if (stat(archiveStatusPath, &stat_buf) == 0)
1189                 return false;
1190
1191         /* check for .ready --- this means archiver is still busy with it */
1192         StatusFilePath(archiveStatusPath, xlog, ".ready");
1193         if (stat(archiveStatusPath, &stat_buf) == 0)
1194                 return true;
1195
1196         /* Race condition --- maybe archiver just finished, so recheck */
1197         StatusFilePath(archiveStatusPath, xlog, ".done");
1198         if (stat(archiveStatusPath, &stat_buf) == 0)
1199                 return false;
1200
1201         /*
1202          * Check to see if the WAL file has been removed by checkpoint,
1203          * which implies it has already been archived, and explains why we
1204          * can't see a status file for it.
1205          */
1206         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1207         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1208                 errno == ENOENT)
1209                 return false;
1210
1211         return true;
1212 }
1213
1214 /*
1215  * XLogArchiveCleanup
1216  *
1217  * Cleanup archive notification file(s) for a particular xlog segment
1218  */
1219 static void
1220 XLogArchiveCleanup(const char *xlog)
1221 {
1222         char            archiveStatusPath[MAXPGPATH];
1223
1224         /* Remove the .done file */
1225         StatusFilePath(archiveStatusPath, xlog, ".done");
1226         unlink(archiveStatusPath);
1227         /* should we complain about failure? */
1228
1229         /* Remove the .ready file if present --- normally it shouldn't be */
1230         StatusFilePath(archiveStatusPath, xlog, ".ready");
1231         unlink(archiveStatusPath);
1232         /* should we complain about failure? */
1233 }
1234
1235 /*
1236  * Advance the Insert state to the next buffer page, writing out the next
1237  * buffer if it still contains unwritten data.
1238  *
1239  * If new_segment is TRUE then we set up the next buffer page as the first
1240  * page of the next xlog segment file, possibly but not usually the next
1241  * consecutive file page.
1242  *
1243  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1244  * just-filled page.  If we can do this for free (without an extra lock),
1245  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1246  * request update still needs to be done, FALSE if we did it internally.
1247  *
1248  * Must be called with WALInsertLock held.
1249  */
1250 static bool
1251 AdvanceXLInsertBuffer(bool new_segment)
1252 {
1253         XLogCtlInsert *Insert = &XLogCtl->Insert;
1254         XLogCtlWrite *Write = &XLogCtl->Write;
1255         int                     nextidx = NextBufIdx(Insert->curridx);
1256         bool            update_needed = true;
1257         XLogRecPtr      OldPageRqstPtr;
1258         XLogwrtRqst WriteRqst;
1259         XLogRecPtr      NewPageEndPtr;
1260         XLogPageHeader NewPage;
1261
1262         /* Use Insert->LogwrtResult copy if it's more fresh */
1263         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
1264                 LogwrtResult = Insert->LogwrtResult;
1265
1266         /*
1267          * Get ending-offset of the buffer page we need to replace (this may be
1268          * zero if the buffer hasn't been used yet).  Fall through if it's already
1269          * written out.
1270          */
1271         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1272         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1273         {
1274                 /* nope, got work to do... */
1275                 XLogRecPtr      FinishedPageRqstPtr;
1276
1277                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1278
1279                 /* Before waiting, get info_lck and update LogwrtResult */
1280                 {
1281                         /* use volatile pointer to prevent code rearrangement */
1282                         volatile XLogCtlData *xlogctl = XLogCtl;
1283
1284                         SpinLockAcquire(&xlogctl->info_lck);
1285                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1286                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1287                         LogwrtResult = xlogctl->LogwrtResult;
1288                         SpinLockRelease(&xlogctl->info_lck);
1289                 }
1290
1291                 update_needed = false;  /* Did the shared-request update */
1292
1293                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1294                 {
1295                         /* OK, someone wrote it already */
1296                         Insert->LogwrtResult = LogwrtResult;
1297                 }
1298                 else
1299                 {
1300                         /* Must acquire write lock */
1301                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1302                         LogwrtResult = Write->LogwrtResult;
1303                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1304                         {
1305                                 /* OK, someone wrote it already */
1306                                 LWLockRelease(WALWriteLock);
1307                                 Insert->LogwrtResult = LogwrtResult;
1308                         }
1309                         else
1310                         {
1311                                 /*
1312                                  * Have to write buffers while holding insert lock. This is
1313                                  * not good, so only write as much as we absolutely must.
1314                                  */
1315                                 WriteRqst.Write = OldPageRqstPtr;
1316                                 WriteRqst.Flush.xlogid = 0;
1317                                 WriteRqst.Flush.xrecoff = 0;
1318                                 XLogWrite(WriteRqst, false, false);
1319                                 LWLockRelease(WALWriteLock);
1320                                 Insert->LogwrtResult = LogwrtResult;
1321                         }
1322                 }
1323         }
1324
1325         /*
1326          * Now the next buffer slot is free and we can set it up to be the next
1327          * output page.
1328          */
1329         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1330
1331         if (new_segment)
1332         {
1333                 /* force it to a segment start point */
1334                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1335                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1336         }
1337
1338         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1339         {
1340                 /* crossing a logid boundary */
1341                 NewPageEndPtr.xlogid += 1;
1342                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1343         }
1344         else
1345                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1346         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1347         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1348
1349         Insert->curridx = nextidx;
1350         Insert->currpage = NewPage;
1351
1352         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1353
1354         /*
1355          * Be sure to re-zero the buffer so that bytes beyond what we've written
1356          * will look like zeroes and not valid XLOG records...
1357          */
1358         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1359
1360         /*
1361          * Fill the new page's header
1362          */
1363         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1364
1365         /* NewPage->xlp_info = 0; */    /* done by memset */
1366         NewPage   ->xlp_tli = ThisTimeLineID;
1367         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1368         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1369
1370         /*
1371          * If first page of an XLOG segment file, make it a long header.
1372          */
1373         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1374         {
1375                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1376
1377                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1378                 NewLongPage->xlp_seg_size = XLogSegSize;
1379                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1380                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1381
1382                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1383         }
1384
1385         return update_needed;
1386 }
1387
1388 /*
1389  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1390  *
1391  * Caller must have just finished filling the open log file (so that
1392  * openLogId/openLogSeg are valid).  We measure the distance from RedoRecPtr
1393  * to the open log file and see if that exceeds CheckPointSegments.
1394  *
1395  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1396  */
1397 static bool
1398 XLogCheckpointNeeded(void)
1399 {
1400         /*
1401          * A straight computation of segment number could overflow 32 bits. Rather
1402          * than assuming we have working 64-bit arithmetic, we compare the
1403          * highest-order bits separately, and force a checkpoint immediately when
1404          * they change.
1405          */
1406         uint32          old_segno,
1407                                 new_segno;
1408         uint32          old_highbits,
1409                                 new_highbits;
1410
1411         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1412                 (RedoRecPtr.xrecoff / XLogSegSize);
1413         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1414         new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile + openLogSeg;
1415         new_highbits = openLogId / XLogSegSize;
1416         if (new_highbits != old_highbits ||
1417                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1418                 return true;
1419         return false;
1420 }
1421
1422 /*
1423  * Write and/or fsync the log at least as far as WriteRqst indicates.
1424  *
1425  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1426  * may stop at any convenient boundary (such as a cache or logfile boundary).
1427  * This option allows us to avoid uselessly issuing multiple writes when a
1428  * single one would do.
1429  *
1430  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1431  * perform end-of-segment actions after writing the last page, even if
1432  * it's not physically the end of its segment.  (NB: this will work properly
1433  * only if caller specifies WriteRqst == page-end and flexible == false,
1434  * and there is some data to write.)
1435  *
1436  * Must be called with WALWriteLock held.
1437  */
1438 static void
1439 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1440 {
1441         XLogCtlWrite *Write = &XLogCtl->Write;
1442         bool            ispartialpage;
1443         bool            last_iteration;
1444         bool            finishing_seg;
1445         bool            use_existent;
1446         int                     curridx;
1447         int                     npages;
1448         int                     startidx;
1449         uint32          startoffset;
1450
1451         /* We should always be inside a critical section here */
1452         Assert(CritSectionCount > 0);
1453
1454         /*
1455          * Update local LogwrtResult (caller probably did this already, but...)
1456          */
1457         LogwrtResult = Write->LogwrtResult;
1458
1459         /*
1460          * Since successive pages in the xlog cache are consecutively allocated,
1461          * we can usually gather multiple pages together and issue just one
1462          * write() call.  npages is the number of pages we have determined can be
1463          * written together; startidx is the cache block index of the first one,
1464          * and startoffset is the file offset at which it should go. The latter
1465          * two variables are only valid when npages > 0, but we must initialize
1466          * all of them to keep the compiler quiet.
1467          */
1468         npages = 0;
1469         startidx = 0;
1470         startoffset = 0;
1471
1472         /*
1473          * Within the loop, curridx is the cache block index of the page to
1474          * consider writing.  We advance Write->curridx only after successfully
1475          * writing pages.  (Right now, this refinement is useless since we are
1476          * going to PANIC if any error occurs anyway; but someday it may come in
1477          * useful.)
1478          */
1479         curridx = Write->curridx;
1480
1481         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1482         {
1483                 /*
1484                  * Make sure we're not ahead of the insert process.  This could happen
1485                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1486                  * last page that's been initialized by AdvanceXLInsertBuffer.
1487                  */
1488                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1489                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1490                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1491                                  XLogCtl->xlblocks[curridx].xlogid,
1492                                  XLogCtl->xlblocks[curridx].xrecoff);
1493
1494                 /* Advance LogwrtResult.Write to end of current buffer page */
1495                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1496                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1497
1498                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1499                 {
1500                         /*
1501                          * Switch to new logfile segment.  We cannot have any pending
1502                          * pages here (since we dump what we have at segment end).
1503                          */
1504                         Assert(npages == 0);
1505                         if (openLogFile >= 0)
1506                                 XLogFileClose();
1507                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1508
1509                         /* create/use new log file */
1510                         use_existent = true;
1511                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1512                                                                            &use_existent, true);
1513                         openLogOff = 0;
1514                 }
1515
1516                 /* Make sure we have the current logfile open */
1517                 if (openLogFile < 0)
1518                 {
1519                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1520                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1521                         openLogOff = 0;
1522                 }
1523
1524                 /* Add current page to the set of pending pages-to-dump */
1525                 if (npages == 0)
1526                 {
1527                         /* first of group */
1528                         startidx = curridx;
1529                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1530                 }
1531                 npages++;
1532
1533                 /*
1534                  * Dump the set if this will be the last loop iteration, or if we are
1535                  * at the last page of the cache area (since the next page won't be
1536                  * contiguous in memory), or if we are at the end of the logfile
1537                  * segment.
1538                  */
1539                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1540
1541                 finishing_seg = !ispartialpage &&
1542                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1543
1544                 if (last_iteration ||
1545                         curridx == XLogCtl->XLogCacheBlck ||
1546                         finishing_seg)
1547                 {
1548                         char       *from;
1549                         Size            nbytes;
1550
1551                         /* Need to seek in the file? */
1552                         if (openLogOff != startoffset)
1553                         {
1554                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1555                                         ereport(PANIC,
1556                                                         (errcode_for_file_access(),
1557                                                          errmsg("could not seek in log file %u, "
1558                                                                         "segment %u to offset %u: %m",
1559                                                                         openLogId, openLogSeg, startoffset)));
1560                                 openLogOff = startoffset;
1561                         }
1562
1563                         /* OK to write the page(s) */
1564                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1565                         nbytes = npages * (Size) XLOG_BLCKSZ;
1566                         errno = 0;
1567                         if (write(openLogFile, from, nbytes) != nbytes)
1568                         {
1569                                 /* if write didn't set errno, assume no disk space */
1570                                 if (errno == 0)
1571                                         errno = ENOSPC;
1572                                 ereport(PANIC,
1573                                                 (errcode_for_file_access(),
1574                                                  errmsg("could not write to log file %u, segment %u "
1575                                                                 "at offset %u, length %lu: %m",
1576                                                                 openLogId, openLogSeg,
1577                                                                 openLogOff, (unsigned long) nbytes)));
1578                         }
1579
1580                         /* Update state for write */
1581                         openLogOff += nbytes;
1582                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1583                         npages = 0;
1584
1585                         /*
1586                          * If we just wrote the whole last page of a logfile segment,
1587                          * fsync the segment immediately.  This avoids having to go back
1588                          * and re-open prior segments when an fsync request comes along
1589                          * later. Doing it here ensures that one and only one backend will
1590                          * perform this fsync.
1591                          *
1592                          * We also do this if this is the last page written for an xlog
1593                          * switch.
1594                          *
1595                          * This is also the right place to notify the Archiver that the
1596                          * segment is ready to copy to archival storage, and to update the
1597                          * timer for archive_timeout, and to signal for a checkpoint if
1598                          * too many logfile segments have been used since the last
1599                          * checkpoint.
1600                          */
1601                         if (finishing_seg || (xlog_switch && last_iteration))
1602                         {
1603                                 issue_xlog_fsync();
1604                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1605
1606                                 if (XLogArchivingActive())
1607                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1608
1609                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1610
1611                                 /*
1612                                  * Signal bgwriter to start a checkpoint if we've consumed too
1613                                  * much xlog since the last one.  For speed, we first check
1614                                  * using the local copy of RedoRecPtr, which might be out of
1615                                  * date; if it looks like a checkpoint is needed, forcibly
1616                                  * update RedoRecPtr and recheck.
1617                                  */
1618                                 if (IsUnderPostmaster &&
1619                                         XLogCheckpointNeeded())
1620                                 {
1621                                         (void) GetRedoRecPtr();
1622                                         if (XLogCheckpointNeeded())
1623                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1624                                 }
1625                         }
1626                 }
1627
1628                 if (ispartialpage)
1629                 {
1630                         /* Only asked to write a partial page */
1631                         LogwrtResult.Write = WriteRqst.Write;
1632                         break;
1633                 }
1634                 curridx = NextBufIdx(curridx);
1635
1636                 /* If flexible, break out of loop as soon as we wrote something */
1637                 if (flexible && npages == 0)
1638                         break;
1639         }
1640
1641         Assert(npages == 0);
1642         Assert(curridx == Write->curridx);
1643
1644         /*
1645          * If asked to flush, do so
1646          */
1647         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1648                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1649         {
1650                 /*
1651                  * Could get here without iterating above loop, in which case we might
1652                  * have no open file or the wrong one.  However, we do not need to
1653                  * fsync more than one file.
1654                  */
1655                 if (sync_method != SYNC_METHOD_OPEN &&
1656                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1657                 {
1658                         if (openLogFile >= 0 &&
1659                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1660                                 XLogFileClose();
1661                         if (openLogFile < 0)
1662                         {
1663                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1664                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1665                                 openLogOff = 0;
1666                         }
1667                         issue_xlog_fsync();
1668                 }
1669                 LogwrtResult.Flush = LogwrtResult.Write;
1670         }
1671
1672         /*
1673          * Update shared-memory status
1674          *
1675          * We make sure that the shared 'request' values do not fall behind the
1676          * 'result' values.  This is not absolutely essential, but it saves some
1677          * code in a couple of places.
1678          */
1679         {
1680                 /* use volatile pointer to prevent code rearrangement */
1681                 volatile XLogCtlData *xlogctl = XLogCtl;
1682
1683                 SpinLockAcquire(&xlogctl->info_lck);
1684                 xlogctl->LogwrtResult = LogwrtResult;
1685                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1686                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1687                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1688                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1689                 SpinLockRelease(&xlogctl->info_lck);
1690         }
1691
1692         Write->LogwrtResult = LogwrtResult;
1693 }
1694
1695 /*
1696  * Record the LSN for an asynchronous transaction commit.
1697  * (This should not be called for aborts, nor for synchronous commits.)
1698  */
1699 void
1700 XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
1701 {
1702         /* use volatile pointer to prevent code rearrangement */
1703         volatile XLogCtlData *xlogctl = XLogCtl;
1704
1705         SpinLockAcquire(&xlogctl->info_lck);
1706         if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
1707                 xlogctl->asyncCommitLSN = asyncCommitLSN;
1708         SpinLockRelease(&xlogctl->info_lck);
1709 }
1710
1711 /*
1712  * Ensure that all XLOG data through the given position is flushed to disk.
1713  *
1714  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1715  * already held, and we try to avoid acquiring it if possible.
1716  */
1717 void
1718 XLogFlush(XLogRecPtr record)
1719 {
1720         XLogRecPtr      WriteRqstPtr;
1721         XLogwrtRqst WriteRqst;
1722
1723         /* Disabled during REDO */
1724         if (InRedo)
1725                 return;
1726
1727         /* Quick exit if already known flushed */
1728         if (XLByteLE(record, LogwrtResult.Flush))
1729                 return;
1730
1731 #ifdef WAL_DEBUG
1732         if (XLOG_DEBUG)
1733                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1734                          record.xlogid, record.xrecoff,
1735                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1736                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1737 #endif
1738
1739         START_CRIT_SECTION();
1740
1741         /*
1742          * Since fsync is usually a horribly expensive operation, we try to
1743          * piggyback as much data as we can on each fsync: if we see any more data
1744          * entered into the xlog buffer, we'll write and fsync that too, so that
1745          * the final value of LogwrtResult.Flush is as large as possible. This
1746          * gives us some chance of avoiding another fsync immediately after.
1747          */
1748
1749         /* initialize to given target; may increase below */
1750         WriteRqstPtr = record;
1751
1752         /* read LogwrtResult and update local state */
1753         {
1754                 /* use volatile pointer to prevent code rearrangement */
1755                 volatile XLogCtlData *xlogctl = XLogCtl;
1756
1757                 SpinLockAcquire(&xlogctl->info_lck);
1758                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
1759                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1760                 LogwrtResult = xlogctl->LogwrtResult;
1761                 SpinLockRelease(&xlogctl->info_lck);
1762         }
1763
1764         /* done already? */
1765         if (!XLByteLE(record, LogwrtResult.Flush))
1766         {
1767                 /* now wait for the write lock */
1768                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1769                 LogwrtResult = XLogCtl->Write.LogwrtResult;
1770                 if (!XLByteLE(record, LogwrtResult.Flush))
1771                 {
1772                         /* try to write/flush later additions to XLOG as well */
1773                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
1774                         {
1775                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
1776                                 uint32          freespace = INSERT_FREESPACE(Insert);
1777
1778                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
1779                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1780                                 else
1781                                 {
1782                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1783                                         WriteRqstPtr.xrecoff -= freespace;
1784                                 }
1785                                 LWLockRelease(WALInsertLock);
1786                                 WriteRqst.Write = WriteRqstPtr;
1787                                 WriteRqst.Flush = WriteRqstPtr;
1788                         }
1789                         else
1790                         {
1791                                 WriteRqst.Write = WriteRqstPtr;
1792                                 WriteRqst.Flush = record;
1793                         }
1794                         XLogWrite(WriteRqst, false, false);
1795                 }
1796                 LWLockRelease(WALWriteLock);
1797         }
1798
1799         END_CRIT_SECTION();
1800
1801         /*
1802          * If we still haven't flushed to the request point then we have a
1803          * problem; most likely, the requested flush point is past end of XLOG.
1804          * This has been seen to occur when a disk page has a corrupted LSN.
1805          *
1806          * Formerly we treated this as a PANIC condition, but that hurts the
1807          * system's robustness rather than helping it: we do not want to take down
1808          * the whole system due to corruption on one data page.  In particular, if
1809          * the bad page is encountered again during recovery then we would be
1810          * unable to restart the database at all!  (This scenario has actually
1811          * happened in the field several times with 7.1 releases. Note that we
1812          * cannot get here while InRedo is true, but if the bad page is brought in
1813          * and marked dirty during recovery then CreateCheckPoint will try to
1814          * flush it at the end of recovery.)
1815          *
1816          * The current approach is to ERROR under normal conditions, but only
1817          * WARNING during recovery, so that the system can be brought up even if
1818          * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
1819          * be promoted to PANIC since xact.c calls this routine inside a critical
1820          * section.  However, calls from bufmgr.c are not within critical sections
1821          * and so we will not force a restart for a bad LSN on a data page.
1822          */
1823         if (XLByteLT(LogwrtResult.Flush, record))
1824                 elog(InRecovery ? WARNING : ERROR,
1825                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
1826                          record.xlogid, record.xrecoff,
1827                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1828 }
1829
1830 /*
1831  * Flush xlog, but without specifying exactly where to flush to.
1832  *
1833  * We normally flush only completed blocks; but if there is nothing to do on
1834  * that basis, we check for unflushed async commits in the current incomplete
1835  * block, and flush through the latest one of those.  Thus, if async commits
1836  * are not being used, we will flush complete blocks only.      We can guarantee
1837  * that async commits reach disk after at most three cycles; normally only
1838  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
1839  * at the end of the buffer ring; this makes a difference only with very high
1840  * load or long wal_writer_delay, but imposes one extra cycle for the worst
1841  * case for async commits.)
1842  *
1843  * This routine is invoked periodically by the background walwriter process.
1844  */
1845 void
1846 XLogBackgroundFlush(void)
1847 {
1848         XLogRecPtr      WriteRqstPtr;
1849         bool            flexible = true;
1850
1851         /* read LogwrtResult and update local state */
1852         {
1853                 /* use volatile pointer to prevent code rearrangement */
1854                 volatile XLogCtlData *xlogctl = XLogCtl;
1855
1856                 SpinLockAcquire(&xlogctl->info_lck);
1857                 LogwrtResult = xlogctl->LogwrtResult;
1858                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
1859                 SpinLockRelease(&xlogctl->info_lck);
1860         }
1861
1862         /* back off to last completed page boundary */
1863         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
1864
1865         /* if we have already flushed that far, consider async commit records */
1866         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1867         {
1868                 /* use volatile pointer to prevent code rearrangement */
1869                 volatile XLogCtlData *xlogctl = XLogCtl;
1870
1871                 SpinLockAcquire(&xlogctl->info_lck);
1872                 WriteRqstPtr = xlogctl->asyncCommitLSN;
1873                 SpinLockRelease(&xlogctl->info_lck);
1874                 flexible = false;               /* ensure it all gets written */
1875         }
1876
1877         /* Done if already known flushed */
1878         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1879                 return;
1880
1881 #ifdef WAL_DEBUG
1882         if (XLOG_DEBUG)
1883                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
1884                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
1885                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1886                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1887 #endif
1888
1889         START_CRIT_SECTION();
1890
1891         /* now wait for the write lock */
1892         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1893         LogwrtResult = XLogCtl->Write.LogwrtResult;
1894         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
1895         {
1896                 XLogwrtRqst WriteRqst;
1897
1898                 WriteRqst.Write = WriteRqstPtr;
1899                 WriteRqst.Flush = WriteRqstPtr;
1900                 XLogWrite(WriteRqst, flexible, false);
1901         }
1902         LWLockRelease(WALWriteLock);
1903
1904         END_CRIT_SECTION();
1905 }
1906
1907 /*
1908  * Flush any previous asynchronously-committed transactions' commit records.
1909  *
1910  * NOTE: it is unwise to assume that this provides any strong guarantees.
1911  * In particular, because of the inexact LSN bookkeeping used by clog.c,
1912  * we cannot assume that hint bits will be settable for these transactions.
1913  */
1914 void
1915 XLogAsyncCommitFlush(void)
1916 {
1917         XLogRecPtr      WriteRqstPtr;
1918
1919         /* use volatile pointer to prevent code rearrangement */
1920         volatile XLogCtlData *xlogctl = XLogCtl;
1921
1922         SpinLockAcquire(&xlogctl->info_lck);
1923         WriteRqstPtr = xlogctl->asyncCommitLSN;
1924         SpinLockRelease(&xlogctl->info_lck);
1925
1926         XLogFlush(WriteRqstPtr);
1927 }
1928
1929 /*
1930  * Test whether XLOG data has been flushed up to (at least) the given position.
1931  *
1932  * Returns true if a flush is still needed.  (It may be that someone else
1933  * is already in process of flushing that far, however.)
1934  */
1935 bool
1936 XLogNeedsFlush(XLogRecPtr record)
1937 {
1938         /* Quick exit if already known flushed */
1939         if (XLByteLE(record, LogwrtResult.Flush))
1940                 return false;
1941
1942         /* read LogwrtResult and update local state */
1943         {
1944                 /* use volatile pointer to prevent code rearrangement */
1945                 volatile XLogCtlData *xlogctl = XLogCtl;
1946
1947                 SpinLockAcquire(&xlogctl->info_lck);
1948                 LogwrtResult = xlogctl->LogwrtResult;
1949                 SpinLockRelease(&xlogctl->info_lck);
1950         }
1951
1952         /* check again */
1953         if (XLByteLE(record, LogwrtResult.Flush))
1954                 return false;
1955
1956         return true;
1957 }
1958
1959 /*
1960  * Create a new XLOG file segment, or open a pre-existing one.
1961  *
1962  * log, seg: identify segment to be created/opened.
1963  *
1964  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
1965  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
1966  * file was used.
1967  *
1968  * use_lock: if TRUE, acquire ControlFileLock while moving file into
1969  * place.  This should be TRUE except during bootstrap log creation.  The
1970  * caller must *not* hold the lock at call.
1971  *
1972  * Returns FD of opened file.
1973  *
1974  * Note: errors here are ERROR not PANIC because we might or might not be
1975  * inside a critical section (eg, during checkpoint there is no reason to
1976  * take down the system on failure).  They will promote to PANIC if we are
1977  * in a critical section.
1978  */
1979 static int
1980 XLogFileInit(uint32 log, uint32 seg,
1981                          bool *use_existent, bool use_lock)
1982 {
1983         char            path[MAXPGPATH];
1984         char            tmppath[MAXPGPATH];
1985         char       *zbuffer;
1986         uint32          installed_log;
1987         uint32          installed_seg;
1988         int                     max_advance;
1989         int                     fd;
1990         int                     nbytes;
1991
1992         XLogFilePath(path, ThisTimeLineID, log, seg);
1993
1994         /*
1995          * Try to use existent file (checkpoint maker may have created it already)
1996          */
1997         if (*use_existent)
1998         {
1999                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2000                                                    S_IRUSR | S_IWUSR);
2001                 if (fd < 0)
2002                 {
2003                         if (errno != ENOENT)
2004                                 ereport(ERROR,
2005                                                 (errcode_for_file_access(),
2006                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2007                                                                 path, log, seg)));
2008                 }
2009                 else
2010                         return fd;
2011         }
2012
2013         /*
2014          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2015          * another process is doing the same thing.  If so, we will end up
2016          * pre-creating an extra log segment.  That seems OK, and better than
2017          * holding the lock throughout this lengthy process.
2018          */
2019         elog(DEBUG2, "creating and filling new WAL file");
2020
2021         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2022
2023         unlink(tmppath);
2024
2025         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2026         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2027                                            S_IRUSR | S_IWUSR);
2028         if (fd < 0)
2029                 ereport(ERROR,
2030                                 (errcode_for_file_access(),
2031                                  errmsg("could not create file \"%s\": %m", tmppath)));
2032
2033         /*
2034          * Zero-fill the file.  We have to do this the hard way to ensure that all
2035          * the file space has really been allocated --- on platforms that allow
2036          * "holes" in files, just seeking to the end doesn't allocate intermediate
2037          * space.  This way, we know that we have all the space and (after the
2038          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2039          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2040          * log file.
2041          *
2042          * Note: palloc zbuffer, instead of just using a local char array, to
2043          * ensure it is reasonably well-aligned; this may save a few cycles
2044          * transferring data to the kernel.
2045          */
2046         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2047         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2048         {
2049                 errno = 0;
2050                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2051                 {
2052                         int                     save_errno = errno;
2053
2054                         /*
2055                          * If we fail to make the file, delete it to release disk space
2056                          */
2057                         unlink(tmppath);
2058                         /* if write didn't set errno, assume problem is no disk space */
2059                         errno = save_errno ? save_errno : ENOSPC;
2060
2061                         ereport(ERROR,
2062                                         (errcode_for_file_access(),
2063                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2064                 }
2065         }
2066         pfree(zbuffer);
2067
2068         if (pg_fsync(fd) != 0)
2069                 ereport(ERROR,
2070                                 (errcode_for_file_access(),
2071                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2072
2073         if (close(fd))
2074                 ereport(ERROR,
2075                                 (errcode_for_file_access(),
2076                                  errmsg("could not close file \"%s\": %m", tmppath)));
2077
2078         /*
2079          * Now move the segment into place with its final name.
2080          *
2081          * If caller didn't want to use a pre-existing file, get rid of any
2082          * pre-existing file.  Otherwise, cope with possibility that someone else
2083          * has created the file while we were filling ours: if so, use ours to
2084          * pre-create a future log segment.
2085          */
2086         installed_log = log;
2087         installed_seg = seg;
2088         max_advance = XLOGfileslop;
2089         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2090                                                                 *use_existent, &max_advance,
2091                                                                 use_lock))
2092         {
2093                 /* No need for any more future segments... */
2094                 unlink(tmppath);
2095         }
2096
2097         elog(DEBUG2, "done creating and filling new WAL file");
2098
2099         /* Set flag to tell caller there was no existent file */
2100         *use_existent = false;
2101
2102         /* Now open original target segment (might not be file I just made) */
2103         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2104                                            S_IRUSR | S_IWUSR);
2105         if (fd < 0)
2106                 ereport(ERROR,
2107                                 (errcode_for_file_access(),
2108                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2109                                   path, log, seg)));
2110
2111         return fd;
2112 }
2113
2114 /*
2115  * Create a new XLOG file segment by copying a pre-existing one.
2116  *
2117  * log, seg: identify segment to be created.
2118  *
2119  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2120  *              a different timeline)
2121  *
2122  * Currently this is only used during recovery, and so there are no locking
2123  * considerations.      But we should be just as tense as XLogFileInit to avoid
2124  * emplacing a bogus file.
2125  */
2126 static void
2127 XLogFileCopy(uint32 log, uint32 seg,
2128                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2129 {
2130         char            path[MAXPGPATH];
2131         char            tmppath[MAXPGPATH];
2132         char            buffer[XLOG_BLCKSZ];
2133         int                     srcfd;
2134         int                     fd;
2135         int                     nbytes;
2136
2137         /*
2138          * Open the source file
2139          */
2140         XLogFilePath(path, srcTLI, srclog, srcseg);
2141         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2142         if (srcfd < 0)
2143                 ereport(ERROR,
2144                                 (errcode_for_file_access(),
2145                                  errmsg("could not open file \"%s\": %m", path)));
2146
2147         /*
2148          * Copy into a temp file name.
2149          */
2150         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2151
2152         unlink(tmppath);
2153
2154         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2155         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2156                                            S_IRUSR | S_IWUSR);
2157         if (fd < 0)
2158                 ereport(ERROR,
2159                                 (errcode_for_file_access(),
2160                                  errmsg("could not create file \"%s\": %m", tmppath)));
2161
2162         /*
2163          * Do the data copying.
2164          */
2165         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2166         {
2167                 errno = 0;
2168                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2169                 {
2170                         if (errno != 0)
2171                                 ereport(ERROR,
2172                                                 (errcode_for_file_access(),
2173                                                  errmsg("could not read file \"%s\": %m", path)));
2174                         else
2175                                 ereport(ERROR,
2176                                                 (errmsg("not enough data in file \"%s\"", path)));
2177                 }
2178                 errno = 0;
2179                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2180                 {
2181                         int                     save_errno = errno;
2182
2183                         /*
2184                          * If we fail to make the file, delete it to release disk space
2185                          */
2186                         unlink(tmppath);
2187                         /* if write didn't set errno, assume problem is no disk space */
2188                         errno = save_errno ? save_errno : ENOSPC;
2189
2190                         ereport(ERROR,
2191                                         (errcode_for_file_access(),
2192                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2193                 }
2194         }
2195
2196         if (pg_fsync(fd) != 0)
2197                 ereport(ERROR,
2198                                 (errcode_for_file_access(),
2199                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2200
2201         if (close(fd))
2202                 ereport(ERROR,
2203                                 (errcode_for_file_access(),
2204                                  errmsg("could not close file \"%s\": %m", tmppath)));
2205
2206         close(srcfd);
2207
2208         /*
2209          * Now move the segment into place with its final name.
2210          */
2211         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2212                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2213 }
2214
2215 /*
2216  * Install a new XLOG segment file as a current or future log segment.
2217  *
2218  * This is used both to install a newly-created segment (which has a temp
2219  * filename while it's being created) and to recycle an old segment.
2220  *
2221  * *log, *seg: identify segment to install as (or first possible target).
2222  * When find_free is TRUE, these are modified on return to indicate the
2223  * actual installation location or last segment searched.
2224  *
2225  * tmppath: initial name of file to install.  It will be renamed into place.
2226  *
2227  * find_free: if TRUE, install the new segment at the first empty log/seg
2228  * number at or after the passed numbers.  If FALSE, install the new segment
2229  * exactly where specified, deleting any existing segment file there.
2230  *
2231  * *max_advance: maximum number of log/seg slots to advance past the starting
2232  * point.  Fail if no free slot is found in this range.  On return, reduced
2233  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2234  * when find_free is FALSE.)
2235  *
2236  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2237  * place.  This should be TRUE except during bootstrap log creation.  The
2238  * caller must *not* hold the lock at call.
2239  *
2240  * Returns TRUE if file installed, FALSE if not installed because of
2241  * exceeding max_advance limit.  On Windows, we also return FALSE if we
2242  * can't rename the file into place because someone's got it open.
2243  * (Any other kind of failure causes ereport().)
2244  */
2245 static bool
2246 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2247                                            bool find_free, int *max_advance,
2248                                            bool use_lock)
2249 {
2250         char            path[MAXPGPATH];
2251         struct stat stat_buf;
2252
2253         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2254
2255         /*
2256          * We want to be sure that only one process does this at a time.
2257          */
2258         if (use_lock)
2259                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2260
2261         if (!find_free)
2262         {
2263                 /* Force installation: get rid of any pre-existing segment file */
2264                 unlink(path);
2265         }
2266         else
2267         {
2268                 /* Find a free slot to put it in */
2269                 while (stat(path, &stat_buf) == 0)
2270                 {
2271                         if (*max_advance <= 0)
2272                         {
2273                                 /* Failed to find a free slot within specified range */
2274                                 if (use_lock)
2275                                         LWLockRelease(ControlFileLock);
2276                                 return false;
2277                         }
2278                         NextLogSeg(*log, *seg);
2279                         (*max_advance)--;
2280                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2281                 }
2282         }
2283
2284         /*
2285          * Prefer link() to rename() here just to be really sure that we don't
2286          * overwrite an existing logfile.  However, there shouldn't be one, so
2287          * rename() is an acceptable substitute except for the truly paranoid.
2288          */
2289 #if HAVE_WORKING_LINK
2290         if (link(tmppath, path) < 0)
2291                 ereport(ERROR,
2292                                 (errcode_for_file_access(),
2293                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2294                                                 tmppath, path, *log, *seg)));
2295         unlink(tmppath);
2296 #else
2297         if (rename(tmppath, path) < 0)
2298         {
2299 #ifdef WIN32
2300 #if !defined(__CYGWIN__)
2301                 if (GetLastError() == ERROR_ACCESS_DENIED)
2302 #else
2303                 if (errno == EACCES)
2304 #endif
2305                 {
2306                         if (use_lock)
2307                                 LWLockRelease(ControlFileLock);
2308                         return false;
2309                 }
2310 #endif   /* WIN32 */
2311
2312                 ereport(ERROR,
2313                                 (errcode_for_file_access(),
2314                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2315                                                 tmppath, path, *log, *seg)));
2316         }
2317 #endif
2318
2319         if (use_lock)
2320                 LWLockRelease(ControlFileLock);
2321
2322         return true;
2323 }
2324
2325 /*
2326  * Open a pre-existing logfile segment for writing.
2327  */
2328 static int
2329 XLogFileOpen(uint32 log, uint32 seg)
2330 {
2331         char            path[MAXPGPATH];
2332         int                     fd;
2333
2334         XLogFilePath(path, ThisTimeLineID, log, seg);
2335
2336         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2337                                            S_IRUSR | S_IWUSR);
2338         if (fd < 0)
2339                 ereport(PANIC,
2340                                 (errcode_for_file_access(),
2341                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2342                                   path, log, seg)));
2343
2344         return fd;
2345 }
2346
2347 /*
2348  * Open a logfile segment for reading (during recovery).
2349  */
2350 static int
2351 XLogFileRead(uint32 log, uint32 seg, int emode)
2352 {
2353         char            path[MAXPGPATH];
2354         char            xlogfname[MAXFNAMELEN];
2355         char            activitymsg[MAXFNAMELEN + 16];
2356         ListCell   *cell;
2357         int                     fd;
2358
2359         /*
2360          * Loop looking for a suitable timeline ID: we might need to read any of
2361          * the timelines listed in expectedTLIs.
2362          *
2363          * We expect curFileTLI on entry to be the TLI of the preceding file in
2364          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2365          * to go backwards; this prevents us from picking up the wrong file when a
2366          * parent timeline extends to higher segment numbers than the child we
2367          * want to read.
2368          */
2369         foreach(cell, expectedTLIs)
2370         {
2371                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2372
2373                 if (tli < curFileTLI)
2374                         break;                          /* don't bother looking at too-old TLIs */
2375
2376                 XLogFileName(xlogfname, tli, log, seg);
2377
2378                 if (InArchiveRecovery)
2379                 {
2380                         /* Report recovery progress in PS display */
2381                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2382                                          xlogfname);
2383                         set_ps_display(activitymsg, false);
2384
2385                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2386                                                                                                           "RECOVERYXLOG",
2387                                                                                                           XLogSegSize);
2388                 }
2389                 else
2390                         XLogFilePath(path, tli, log, seg);
2391
2392                 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2393                 if (fd >= 0)
2394                 {
2395                         /* Success! */
2396                         curFileTLI = tli;
2397
2398                         /* Report recovery progress in PS display */
2399                         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2400                                          xlogfname);
2401                         set_ps_display(activitymsg, false);
2402
2403                         return fd;
2404                 }
2405                 if (errno != ENOENT)    /* unexpected failure? */
2406                         ereport(PANIC,
2407                                         (errcode_for_file_access(),
2408                         errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2409                                    path, log, seg)));
2410         }
2411
2412         /* Couldn't find it.  For simplicity, complain about front timeline */
2413         XLogFilePath(path, recoveryTargetTLI, log, seg);
2414         errno = ENOENT;
2415         ereport(emode,
2416                         (errcode_for_file_access(),
2417                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2418                                   path, log, seg)));
2419         return -1;
2420 }
2421
2422 /*
2423  * Close the current logfile segment for writing.
2424  */
2425 static void
2426 XLogFileClose(void)
2427 {
2428         Assert(openLogFile >= 0);
2429
2430         /*
2431          * posix_fadvise is problematic on many platforms: on older x86 Linux it
2432          * just dumps core, and there are reports of problems on PPC platforms as
2433          * well.  The following is therefore disabled for the time being. We could
2434          * consider some kind of configure test to see if it's safe to use, but
2435          * since we lack hard evidence that there's any useful performance gain to
2436          * be had, spending time on that seems unprofitable for now.
2437          */
2438 #ifdef NOT_USED
2439
2440         /*
2441          * WAL segment files will not be re-read in normal operation, so we advise
2442          * OS to release any cached pages.      But do not do so if WAL archiving is
2443          * active, because archiver process could use the cache to read the WAL
2444          * segment.
2445          *
2446          * While O_DIRECT works for O_SYNC, posix_fadvise() works for fsync() and
2447          * O_SYNC, and some platforms only have posix_fadvise().
2448          */
2449 #if defined(HAVE_DECL_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2450         if (!XLogArchivingActive())
2451                 posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2452 #endif
2453 #endif   /* NOT_USED */
2454
2455         if (close(openLogFile))
2456                 ereport(PANIC,
2457                                 (errcode_for_file_access(),
2458                                  errmsg("could not close log file %u, segment %u: %m",
2459                                                 openLogId, openLogSeg)));
2460         openLogFile = -1;
2461 }
2462
2463 /*
2464  * Attempt to retrieve the specified file from off-line archival storage.
2465  * If successful, fill "path" with its complete path (note that this will be
2466  * a temp file name that doesn't follow the normal naming convention), and
2467  * return TRUE.
2468  *
2469  * If not successful, fill "path" with the name of the normal on-line file
2470  * (which may or may not actually exist, but we'll try to use it), and return
2471  * FALSE.
2472  *
2473  * For fixed-size files, the caller may pass the expected size as an
2474  * additional crosscheck on successful recovery.  If the file size is not
2475  * known, set expectedSize = 0.
2476  */
2477 static bool
2478 RestoreArchivedFile(char *path, const char *xlogfname,
2479                                         const char *recovername, off_t expectedSize)
2480 {
2481         char            xlogpath[MAXPGPATH];
2482         char            xlogRestoreCmd[MAXPGPATH];
2483         char            lastRestartPointFname[MAXPGPATH];
2484         char       *dp;
2485         char       *endp;
2486         const char *sp;
2487         int                     rc;
2488         bool            signaled;
2489         struct stat stat_buf;
2490         uint32          restartLog;
2491         uint32          restartSeg;
2492
2493         /*
2494          * When doing archive recovery, we always prefer an archived log file even
2495          * if a file of the same name exists in XLOGDIR.  The reason is that the
2496          * file in XLOGDIR could be an old, un-filled or partly-filled version
2497          * that was copied and restored as part of backing up $PGDATA.
2498          *
2499          * We could try to optimize this slightly by checking the local copy
2500          * lastchange timestamp against the archived copy, but we have no API to
2501          * do this, nor can we guarantee that the lastchange timestamp was
2502          * preserved correctly when we copied to archive. Our aim is robustness,
2503          * so we elect not to do this.
2504          *
2505          * If we cannot obtain the log file from the archive, however, we will try
2506          * to use the XLOGDIR file if it exists.  This is so that we can make use
2507          * of log segments that weren't yet transferred to the archive.
2508          *
2509          * Notice that we don't actually overwrite any files when we copy back
2510          * from archive because the recoveryRestoreCommand may inadvertently
2511          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2512          * fallback to the segments remaining in current XLOGDIR later. The
2513          * copy-from-archive filename is always the same, ensuring that we don't
2514          * run out of disk space on long recoveries.
2515          */
2516         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2517
2518         /*
2519          * Make sure there is no existing file named recovername.
2520          */
2521         if (stat(xlogpath, &stat_buf) != 0)
2522         {
2523                 if (errno != ENOENT)
2524                         ereport(FATAL,
2525                                         (errcode_for_file_access(),
2526                                          errmsg("could not stat file \"%s\": %m",
2527                                                         xlogpath)));
2528         }
2529         else
2530         {
2531                 if (unlink(xlogpath) != 0)
2532                         ereport(FATAL,
2533                                         (errcode_for_file_access(),
2534                                          errmsg("could not remove file \"%s\": %m",
2535                                                         xlogpath)));
2536         }
2537
2538         /*
2539          * Calculate the archive file cutoff point for use during log shipping
2540          * replication. All files earlier than this point can be deleted
2541          * from the archive, though there is no requirement to do so.
2542          *
2543          * We initialise this with the filename of an InvalidXLogRecPtr, which
2544          * will prevent the deletion of any WAL files from the archive
2545          * because of the alphabetic sorting property of WAL filenames.
2546          *
2547          * Once we have successfully located the redo pointer of the checkpoint
2548          * from which we start recovery we never request a file prior to the redo
2549          * pointer of the last restartpoint. When redo begins we know that we
2550          * have successfully located it, so there is no need for additional
2551          * status flags to signify the point when we can begin deleting WAL files
2552          * from the archive.
2553          */
2554         if (InRedo)
2555         {
2556                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2557                                         restartLog, restartSeg);
2558                 XLogFileName(lastRestartPointFname,
2559                                          ControlFile->checkPointCopy.ThisTimeLineID,
2560                                          restartLog, restartSeg);
2561                 /* we shouldn't need anything earlier than last restart point */
2562                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2563         }
2564         else
2565                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2566
2567         /*
2568          * construct the command to be executed
2569          */
2570         dp = xlogRestoreCmd;
2571         endp = xlogRestoreCmd + MAXPGPATH - 1;
2572         *endp = '\0';
2573
2574         for (sp = recoveryRestoreCommand; *sp; sp++)
2575         {
2576                 if (*sp == '%')
2577                 {
2578                         switch (sp[1])
2579                         {
2580                                 case 'p':
2581                                         /* %p: relative path of target file */
2582                                         sp++;
2583                                         StrNCpy(dp, xlogpath, endp - dp);
2584                                         make_native_path(dp);
2585                                         dp += strlen(dp);
2586                                         break;
2587                                 case 'f':
2588                                         /* %f: filename of desired file */
2589                                         sp++;
2590                                         StrNCpy(dp, xlogfname, endp - dp);
2591                                         dp += strlen(dp);
2592                                         break;
2593                                 case 'r':
2594                                         /* %r: filename of last restartpoint */
2595                                         sp++;
2596                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2597                                         dp += strlen(dp);
2598                                         break;
2599                                 case '%':
2600                                         /* convert %% to a single % */
2601                                         sp++;
2602                                         if (dp < endp)
2603                                                 *dp++ = *sp;
2604                                         break;
2605                                 default:
2606                                         /* otherwise treat the % as not special */
2607                                         if (dp < endp)
2608                                                 *dp++ = *sp;
2609                                         break;
2610                         }
2611                 }
2612                 else
2613                 {
2614                         if (dp < endp)
2615                                 *dp++ = *sp;
2616                 }
2617         }
2618         *dp = '\0';
2619
2620         ereport(DEBUG3,
2621                         (errmsg_internal("executing restore command \"%s\"",
2622                                                          xlogRestoreCmd)));
2623
2624         /*
2625          * Copy xlog from archival storage to XLOGDIR
2626          */
2627         rc = system(xlogRestoreCmd);
2628         if (rc == 0)
2629         {
2630                 /*
2631                  * command apparently succeeded, but let's make sure the file is
2632                  * really there now and has the correct size.
2633                  *
2634                  * XXX I made wrong-size a fatal error to ensure the DBA would notice
2635                  * it, but is that too strong?  We could try to plow ahead with a
2636                  * local copy of the file ... but the problem is that there probably
2637                  * isn't one, and we'd incorrectly conclude we've reached the end of
2638                  * WAL and we're done recovering ...
2639                  */
2640                 if (stat(xlogpath, &stat_buf) == 0)
2641                 {
2642                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2643                                 ereport(FATAL,
2644                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
2645                                                                 xlogfname,
2646                                                                 (unsigned long) stat_buf.st_size,
2647                                                                 (unsigned long) expectedSize)));
2648                         else
2649                         {
2650                                 ereport(LOG,
2651                                                 (errmsg("restored log file \"%s\" from archive",
2652                                                                 xlogfname)));
2653                                 strcpy(path, xlogpath);
2654                                 return true;
2655                         }
2656                 }
2657                 else
2658                 {
2659                         /* stat failed */
2660                         if (errno != ENOENT)
2661                                 ereport(FATAL,
2662                                                 (errcode_for_file_access(),
2663                                                  errmsg("could not stat file \"%s\": %m",
2664                                                                 xlogpath)));
2665                 }
2666         }
2667
2668         /*
2669          * Remember, we rollforward UNTIL the restore fails so failure here is
2670          * just part of the process... that makes it difficult to determine
2671          * whether the restore failed because there isn't an archive to restore,
2672          * or because the administrator has specified the restore program
2673          * incorrectly.  We have to assume the former.
2674          *
2675          * However, if the failure was due to any sort of signal, it's best to
2676          * punt and abort recovery.  (If we "return false" here, upper levels will
2677          * assume that recovery is complete and start up the database!) It's
2678          * essential to abort on child SIGINT and SIGQUIT, because per spec
2679          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
2680          * those it's a good bet we should have gotten it too.  Aborting on other
2681          * signals such as SIGTERM seems a good idea as well.
2682          *
2683          * Per the Single Unix Spec, shells report exit status > 128 when a called
2684          * command died on a signal.  Also, 126 and 127 are used to report
2685          * problems such as an unfindable command; treat those as fatal errors
2686          * too.
2687          */
2688         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
2689
2690         ereport(signaled ? FATAL : DEBUG2,
2691                 (errmsg("could not restore file \"%s\" from archive: return code %d",
2692                                 xlogfname, rc)));
2693
2694         /*
2695          * if an archived file is not available, there might still be a version of
2696          * this file in XLOGDIR, so return that as the filename to open.
2697          *
2698          * In many recovery scenarios we expect this to fail also, but if so that
2699          * just means we've reached the end of WAL.
2700          */
2701         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
2702         return false;
2703 }
2704
2705 /*
2706  * Preallocate log files beyond the specified log endpoint.
2707  *
2708  * XXX this is currently extremely conservative, since it forces only one
2709  * future log segment to exist, and even that only if we are 75% done with
2710  * the current one.  This is only appropriate for very low-WAL-volume systems.
2711  * High-volume systems will be OK once they've built up a sufficient set of
2712  * recycled log segments, but the startup transient is likely to include
2713  * a lot of segment creations by foreground processes, which is not so good.
2714  */
2715 static void
2716 PreallocXlogFiles(XLogRecPtr endptr)
2717 {
2718         uint32          _logId;
2719         uint32          _logSeg;
2720         int                     lf;
2721         bool            use_existent;
2722
2723         XLByteToPrevSeg(endptr, _logId, _logSeg);
2724         if ((endptr.xrecoff - 1) % XLogSegSize >=
2725                 (uint32) (0.75 * XLogSegSize))
2726         {
2727                 NextLogSeg(_logId, _logSeg);
2728                 use_existent = true;
2729                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
2730                 close(lf);
2731                 if (!use_existent)
2732                         CheckpointStats.ckpt_segs_added++;
2733         }
2734 }
2735
2736 /*
2737  * Recycle or remove all log files older or equal to passed log/seg#
2738  *
2739  * endptr is current (or recent) end of xlog; this is used to determine
2740  * whether we want to recycle rather than delete no-longer-wanted log files.
2741  */
2742 static void
2743 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
2744 {
2745         uint32          endlogId;
2746         uint32          endlogSeg;
2747         int                     max_advance;
2748         DIR                *xldir;
2749         struct dirent *xlde;
2750         char            lastoff[MAXFNAMELEN];
2751         char            path[MAXPGPATH];
2752
2753         /*
2754          * Initialize info about where to try to recycle to.  We allow recycling
2755          * segments up to XLOGfileslop segments beyond the current XLOG location.
2756          */
2757         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
2758         max_advance = XLOGfileslop;
2759
2760         xldir = AllocateDir(XLOGDIR);
2761         if (xldir == NULL)
2762                 ereport(ERROR,
2763                                 (errcode_for_file_access(),
2764                                  errmsg("could not open transaction log directory \"%s\": %m",
2765                                                 XLOGDIR)));
2766
2767         XLogFileName(lastoff, ThisTimeLineID, log, seg);
2768
2769         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2770         {
2771                 /*
2772                  * We ignore the timeline part of the XLOG segment identifiers in
2773                  * deciding whether a segment is still needed.  This ensures that we
2774                  * won't prematurely remove a segment from a parent timeline. We could
2775                  * probably be a little more proactive about removing segments of
2776                  * non-parent timelines, but that would be a whole lot more
2777                  * complicated.
2778                  *
2779                  * We use the alphanumeric sorting property of the filenames to decide
2780                  * which ones are earlier than the lastoff segment.
2781                  */
2782                 if (strlen(xlde->d_name) == 24 &&
2783                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2784                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
2785                 {
2786                         if (XLogArchiveCheckDone(xlde->d_name))
2787                         {
2788                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2789
2790                                 /*
2791                                  * Before deleting the file, see if it can be recycled as a
2792                                  * future log segment.
2793                                  */
2794                                 if (InstallXLogFileSegment(&endlogId, &endlogSeg, path,
2795                                                                                    true, &max_advance,
2796                                                                                    true))
2797                                 {
2798                                         ereport(DEBUG2,
2799                                                         (errmsg("recycled transaction log file \"%s\"",
2800                                                                         xlde->d_name)));
2801                                         CheckpointStats.ckpt_segs_recycled++;
2802                                         /* Needn't recheck that slot on future iterations */
2803                                         if (max_advance > 0)
2804                                         {
2805                                                 NextLogSeg(endlogId, endlogSeg);
2806                                                 max_advance--;
2807                                         }
2808                                 }
2809                                 else
2810                                 {
2811                                         /* No need for any more future segments... */
2812                                         ereport(DEBUG2,
2813                                                         (errmsg("removing transaction log file \"%s\"",
2814                                                                         xlde->d_name)));
2815                                         unlink(path);
2816                                         CheckpointStats.ckpt_segs_removed++;
2817                                 }
2818
2819                                 XLogArchiveCleanup(xlde->d_name);
2820                         }
2821                 }
2822         }
2823
2824         FreeDir(xldir);
2825 }
2826
2827 /*
2828  * Remove previous backup history files.  This also retries creation of
2829  * .ready files for any backup history files for which XLogArchiveNotify
2830  * failed earlier.
2831  */
2832 static void
2833 CleanupBackupHistory(void)
2834 {
2835         DIR                *xldir;
2836         struct dirent *xlde;
2837         char            path[MAXPGPATH];
2838
2839         xldir = AllocateDir(XLOGDIR);
2840         if (xldir == NULL)
2841                 ereport(ERROR,
2842                                 (errcode_for_file_access(),
2843                                  errmsg("could not open transaction log directory \"%s\": %m",
2844                                                 XLOGDIR)));
2845
2846         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
2847         {
2848                 if (strlen(xlde->d_name) > 24 &&
2849                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
2850                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
2851                                    ".backup") == 0)
2852                 {
2853                         if (XLogArchiveCheckDone(xlde->d_name))
2854                         {
2855                                 ereport(DEBUG2,
2856                                 (errmsg("removing transaction log backup history file \"%s\"",
2857                                                 xlde->d_name)));
2858                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
2859                                 unlink(path);
2860                                 XLogArchiveCleanup(xlde->d_name);
2861                         }
2862                 }
2863         }
2864
2865         FreeDir(xldir);
2866 }
2867
2868 /*
2869  * Restore the backup blocks present in an XLOG record, if any.
2870  *
2871  * We assume all of the record has been read into memory at *record.
2872  *
2873  * Note: when a backup block is available in XLOG, we restore it
2874  * unconditionally, even if the page in the database appears newer.
2875  * This is to protect ourselves against database pages that were partially
2876  * or incorrectly written during a crash.  We assume that the XLOG data
2877  * must be good because it has passed a CRC check, while the database
2878  * page might not be.  This will force us to replay all subsequent
2879  * modifications of the page that appear in XLOG, rather than possibly
2880  * ignoring them as already applied, but that's not a huge drawback.
2881  */
2882 static void
2883 RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
2884 {
2885         Buffer          buffer;
2886         Page            page;
2887         BkpBlock        bkpb;
2888         char       *blk;
2889         int                     i;
2890
2891         blk = (char *) XLogRecGetData(record) + record->xl_len;
2892         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
2893         {
2894                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
2895                         continue;
2896
2897                 memcpy(&bkpb, blk, sizeof(BkpBlock));
2898                 blk += sizeof(BkpBlock);
2899
2900                 buffer = XLogReadBufferWithFork(bkpb.node, bkpb.fork, bkpb.block,
2901                                                                                 true);
2902                 Assert(BufferIsValid(buffer));
2903                 page = (Page) BufferGetPage(buffer);
2904
2905                 if (bkpb.hole_length == 0)
2906                 {
2907                         memcpy((char *) page, blk, BLCKSZ);
2908                 }
2909                 else
2910                 {
2911                         /* must zero-fill the hole */
2912                         MemSet((char *) page, 0, BLCKSZ);
2913                         memcpy((char *) page, blk, bkpb.hole_offset);
2914                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
2915                                    blk + bkpb.hole_offset,
2916                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
2917                 }
2918
2919                 PageSetLSN(page, lsn);
2920                 PageSetTLI(page, ThisTimeLineID);
2921                 MarkBufferDirty(buffer);
2922                 UnlockReleaseBuffer(buffer);
2923
2924                 blk += BLCKSZ - bkpb.hole_length;
2925         }
2926 }
2927
2928 /*
2929  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
2930  * record (other than to the minimal extent of computing the amount of
2931  * data to read in) until we've checked the CRCs.
2932  *
2933  * We assume all of the record has been read into memory at *record.
2934  */
2935 static bool
2936 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
2937 {
2938         pg_crc32        crc;
2939         int                     i;
2940         uint32          len = record->xl_len;
2941         BkpBlock        bkpb;
2942         char       *blk;
2943
2944         /* First the rmgr data */
2945         INIT_CRC32(crc);
2946         COMP_CRC32(crc, XLogRecGetData(record), len);
2947
2948         /* Add in the backup blocks, if any */
2949         blk = (char *) XLogRecGetData(record) + len;
2950         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
2951         {
2952                 uint32          blen;
2953
2954                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
2955                         continue;
2956
2957                 memcpy(&bkpb, blk, sizeof(BkpBlock));
2958                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
2959                 {
2960                         ereport(emode,
2961                                         (errmsg("incorrect hole size in record at %X/%X",
2962                                                         recptr.xlogid, recptr.xrecoff)));
2963                         return false;
2964                 }
2965                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
2966                 COMP_CRC32(crc, blk, blen);
2967                 blk += blen;
2968         }
2969
2970         /* Check that xl_tot_len agrees with our calculation */
2971         if (blk != (char *) record + record->xl_tot_len)
2972         {
2973                 ereport(emode,
2974                                 (errmsg("incorrect total length in record at %X/%X",
2975                                                 recptr.xlogid, recptr.xrecoff)));
2976                 return false;
2977         }
2978
2979         /* Finally include the record header */
2980         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
2981                            SizeOfXLogRecord - sizeof(pg_crc32));
2982         FIN_CRC32(crc);
2983
2984         if (!EQ_CRC32(record->xl_crc, crc))
2985         {
2986                 ereport(emode,
2987                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
2988                                 recptr.xlogid, recptr.xrecoff)));
2989                 return false;
2990         }
2991
2992         return true;
2993 }
2994
2995 /*
2996  * Attempt to read an XLOG record.
2997  *
2998  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
2999  * try to read a record just after the last one previously read.
3000  *
3001  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3002  * (emode must be either PANIC or LOG.)
3003  *
3004  * The record is copied into readRecordBuf, so that on successful return,
3005  * the returned record pointer always points there.
3006  */
3007 static XLogRecord *
3008 ReadRecord(XLogRecPtr *RecPtr, int emode)
3009 {
3010         XLogRecord *record;
3011         char       *buffer;
3012         XLogRecPtr      tmpRecPtr = EndRecPtr;
3013         bool            randAccess = false;
3014         uint32          len,
3015                                 total_len;
3016         uint32          targetPageOff;
3017         uint32          targetRecOff;
3018         uint32          pageHeaderSize;
3019
3020         if (readBuf == NULL)
3021         {
3022                 /*
3023                  * First time through, permanently allocate readBuf.  We do it this
3024                  * way, rather than just making a static array, for two reasons: (1)
3025                  * no need to waste the storage in most instantiations of the backend;
3026                  * (2) a static char array isn't guaranteed to have any particular
3027                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3028                  */
3029                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3030                 Assert(readBuf != NULL);
3031         }
3032
3033         if (RecPtr == NULL)
3034         {
3035                 RecPtr = &tmpRecPtr;
3036                 /* fast case if next record is on same page */
3037                 if (nextRecord != NULL)
3038                 {
3039                         record = nextRecord;
3040                         goto got_record;
3041                 }
3042                 /* align old recptr to next page */
3043                 if (tmpRecPtr.xrecoff % XLOG_BLCKSZ != 0)
3044                         tmpRecPtr.xrecoff += (XLOG_BLCKSZ - tmpRecPtr.xrecoff % XLOG_BLCKSZ);
3045                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3046                 {
3047                         (tmpRecPtr.xlogid)++;
3048                         tmpRecPtr.xrecoff = 0;
3049                 }
3050                 /* We will account for page header size below */
3051         }
3052         else
3053         {
3054                 if (!XRecOffIsValid(RecPtr->xrecoff))
3055                         ereport(PANIC,
3056                                         (errmsg("invalid record offset at %X/%X",
3057                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3058
3059                 /*
3060                  * Since we are going to a random position in WAL, forget any prior
3061                  * state about what timeline we were in, and allow it to be any
3062                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3063                  * to go backwards (but we can't reset that variable right here, since
3064                  * we might not change files at all).
3065                  */
3066                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3067                 randAccess = true;              /* allow curFileTLI to go backwards too */
3068         }
3069
3070         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
3071         {
3072                 close(readFile);
3073                 readFile = -1;
3074         }
3075         XLByteToSeg(*RecPtr, readId, readSeg);
3076         if (readFile < 0)
3077         {
3078                 /* Now it's okay to reset curFileTLI if random fetch */
3079                 if (randAccess)
3080                         curFileTLI = 0;
3081
3082                 readFile = XLogFileRead(readId, readSeg, emode);
3083                 if (readFile < 0)
3084                         goto next_record_is_invalid;
3085
3086                 /*
3087                  * Whenever switching to a new WAL segment, we read the first page of
3088                  * the file and validate its header, even if that's not where the
3089                  * target record is.  This is so that we can check the additional
3090                  * identification info that is present in the first page's "long"
3091                  * header.
3092                  */
3093                 readOff = 0;
3094                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3095                 {
3096                         ereport(emode,
3097                                         (errcode_for_file_access(),
3098                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3099                                                         readId, readSeg, readOff)));
3100                         goto next_record_is_invalid;
3101                 }
3102                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3103                         goto next_record_is_invalid;
3104         }
3105
3106         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3107         if (readOff != targetPageOff)
3108         {
3109                 readOff = targetPageOff;
3110                 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
3111                 {
3112                         ereport(emode,
3113                                         (errcode_for_file_access(),
3114                                          errmsg("could not seek in log file %u, segment %u to offset %u: %m",
3115                                                         readId, readSeg, readOff)));
3116                         goto next_record_is_invalid;
3117                 }
3118                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3119                 {
3120                         ereport(emode,
3121                                         (errcode_for_file_access(),
3122                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
3123                                                         readId, readSeg, readOff)));
3124                         goto next_record_is_invalid;
3125                 }
3126                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3127                         goto next_record_is_invalid;
3128         }
3129         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3130         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3131         if (targetRecOff == 0)
3132         {
3133                 /*
3134                  * Can only get here in the continuing-from-prev-page case, because
3135                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3136                  * to skip over the new page's header.
3137                  */
3138                 tmpRecPtr.xrecoff += pageHeaderSize;
3139                 targetRecOff = pageHeaderSize;
3140         }
3141         else if (targetRecOff < pageHeaderSize)
3142         {
3143                 ereport(emode,
3144                                 (errmsg("invalid record offset at %X/%X",
3145                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3146                 goto next_record_is_invalid;
3147         }
3148         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3149                 targetRecOff == pageHeaderSize)
3150         {
3151                 ereport(emode,
3152                                 (errmsg("contrecord is requested by %X/%X",
3153                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3154                 goto next_record_is_invalid;
3155         }
3156         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3157
3158 got_record:;
3159
3160         /*
3161          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3162          * required.
3163          */
3164         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3165         {
3166                 if (record->xl_len != 0)
3167                 {
3168                         ereport(emode,
3169                                         (errmsg("invalid xlog switch record at %X/%X",
3170                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3171                         goto next_record_is_invalid;
3172                 }
3173         }
3174         else if (record->xl_len == 0)
3175         {
3176                 ereport(emode,
3177                                 (errmsg("record with zero length at %X/%X",
3178                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3179                 goto next_record_is_invalid;
3180         }
3181         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3182                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3183                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3184         {
3185                 ereport(emode,
3186                                 (errmsg("invalid record length at %X/%X",
3187                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3188                 goto next_record_is_invalid;
3189         }
3190         if (record->xl_rmid > RM_MAX_ID)
3191         {
3192                 ereport(emode,
3193                                 (errmsg("invalid resource manager ID %u at %X/%X",
3194                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3195                 goto next_record_is_invalid;
3196         }
3197         if (randAccess)
3198         {
3199                 /*
3200                  * We can't exactly verify the prev-link, but surely it should be less
3201                  * than the record's own address.
3202                  */
3203                 if (!XLByteLT(record->xl_prev, *RecPtr))
3204                 {
3205                         ereport(emode,
3206                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3207                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3208                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3209                         goto next_record_is_invalid;
3210                 }
3211         }
3212         else
3213         {
3214                 /*
3215                  * Record's prev-link should exactly match our previous location. This
3216                  * check guards against torn WAL pages where a stale but valid-looking
3217                  * WAL record starts on a sector boundary.
3218                  */
3219                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3220                 {
3221                         ereport(emode,
3222                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3223                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3224                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3225                         goto next_record_is_invalid;
3226                 }
3227         }
3228
3229         /*
3230          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3231          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3232          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3233          * enough for all "normal" records, but very large commit or abort records
3234          * might need more space.)
3235          */
3236         total_len = record->xl_tot_len;
3237         if (total_len > readRecordBufSize)
3238         {
3239                 uint32          newSize = total_len;
3240
3241                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3242                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3243                 if (readRecordBuf)
3244                         free(readRecordBuf);
3245                 readRecordBuf = (char *) malloc(newSize);
3246                 if (!readRecordBuf)
3247                 {
3248                         readRecordBufSize = 0;
3249                         /* We treat this as a "bogus data" condition */
3250                         ereport(emode,
3251                                         (errmsg("record length %u at %X/%X too long",
3252                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3253                         goto next_record_is_invalid;
3254                 }
3255                 readRecordBufSize = newSize;
3256         }
3257
3258         buffer = readRecordBuf;
3259         nextRecord = NULL;
3260         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3261         if (total_len > len)
3262         {
3263                 /* Need to reassemble record */
3264                 XLogContRecord *contrecord;
3265                 uint32          gotlen = len;
3266
3267                 memcpy(buffer, record, len);
3268                 record = (XLogRecord *) buffer;
3269                 buffer += len;
3270                 for (;;)
3271                 {
3272                         readOff += XLOG_BLCKSZ;
3273                         if (readOff >= XLogSegSize)
3274                         {
3275                                 close(readFile);
3276                                 readFile = -1;
3277                                 NextLogSeg(readId, readSeg);
3278                                 readFile = XLogFileRead(readId, readSeg, emode);
3279                                 if (readFile < 0)
3280                                         goto next_record_is_invalid;
3281                                 readOff = 0;
3282                         }
3283                         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3284                         {
3285                                 ereport(emode,
3286                                                 (errcode_for_file_access(),
3287                                                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
3288                                                                 readId, readSeg, readOff)));
3289                                 goto next_record_is_invalid;
3290                         }
3291                         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
3292                                 goto next_record_is_invalid;
3293                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3294                         {
3295                                 ereport(emode,
3296                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3297                                                                 readId, readSeg, readOff)));
3298                                 goto next_record_is_invalid;
3299                         }
3300                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3301                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3302                         if (contrecord->xl_rem_len == 0 ||
3303                                 total_len != (contrecord->xl_rem_len + gotlen))
3304                         {
3305                                 ereport(emode,
3306                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3307                                                                 contrecord->xl_rem_len,
3308                                                                 readId, readSeg, readOff)));
3309                                 goto next_record_is_invalid;
3310                         }
3311                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3312                         if (contrecord->xl_rem_len > len)
3313                         {
3314                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3315                                 gotlen += len;
3316                                 buffer += len;
3317                                 continue;
3318                         }
3319                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3320                                    contrecord->xl_rem_len);
3321                         break;
3322                 }
3323                 if (!RecordIsValid(record, *RecPtr, emode))
3324                         goto next_record_is_invalid;
3325                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3326                 if (XLOG_BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
3327                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
3328                 {
3329                         nextRecord = (XLogRecord *) ((char *) contrecord +
3330                                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
3331                 }
3332                 EndRecPtr.xlogid = readId;
3333                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3334                         pageHeaderSize +
3335                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3336                 ReadRecPtr = *RecPtr;
3337                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3338                 return record;
3339         }
3340
3341         /* Record does not cross a page boundary */
3342         if (!RecordIsValid(record, *RecPtr, emode))
3343                 goto next_record_is_invalid;
3344         if (XLOG_BLCKSZ - SizeOfXLogRecord >= RecPtr->xrecoff % XLOG_BLCKSZ +
3345                 MAXALIGN(total_len))
3346                 nextRecord = (XLogRecord *) ((char *) record + MAXALIGN(total_len));
3347         EndRecPtr.xlogid = RecPtr->xlogid;
3348         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3349         ReadRecPtr = *RecPtr;
3350         memcpy(buffer, record, total_len);
3351
3352         /*
3353          * Special processing if it's an XLOG SWITCH record
3354          */
3355         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3356         {
3357                 /* Pretend it extends to end of segment */
3358                 EndRecPtr.xrecoff += XLogSegSize - 1;
3359                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3360                 nextRecord = NULL;              /* definitely not on same page */
3361
3362                 /*
3363                  * Pretend that readBuf contains the last page of the segment. This is
3364                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3365                  * segment.
3366                  */
3367                 readOff = XLogSegSize - XLOG_BLCKSZ;
3368         }
3369         return (XLogRecord *) buffer;
3370
3371 next_record_is_invalid:;
3372         if (readFile >= 0)
3373         {
3374                 close(readFile);
3375                 readFile = -1;
3376         }
3377         nextRecord = NULL;
3378         return NULL;
3379 }
3380
3381 /*
3382  * Check whether the xlog header of a page just read in looks valid.
3383  *
3384  * This is just a convenience subroutine to avoid duplicated code in
3385  * ReadRecord.  It's not intended for use from anywhere else.
3386  */
3387 static bool
3388 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3389 {
3390         XLogRecPtr      recaddr;
3391
3392         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3393         {
3394                 ereport(emode,
3395                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3396                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3397                 return false;
3398         }
3399         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3400         {
3401                 ereport(emode,
3402                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3403                                                 hdr->xlp_info, readId, readSeg, readOff)));
3404                 return false;
3405         }
3406         if (hdr->xlp_info & XLP_LONG_HEADER)
3407         {
3408                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3409
3410                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3411                 {
3412                         char            fhdrident_str[32];
3413                         char            sysident_str[32];
3414
3415                         /*
3416                          * Format sysids separately to keep platform-dependent format code
3417                          * out of the translatable message string.
3418                          */
3419                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
3420                                          longhdr->xlp_sysid);
3421                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
3422                                          ControlFile->system_identifier);
3423                         ereport(emode,
3424                                         (errmsg("WAL file is from different system"),
3425                                          errdetail("WAL file SYSID is %s, pg_control SYSID is %s",
3426                                                            fhdrident_str, sysident_str)));
3427                         return false;
3428                 }
3429                 if (longhdr->xlp_seg_size != XLogSegSize)
3430                 {
3431                         ereport(emode,
3432                                         (errmsg("WAL file is from different system"),
3433                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
3434                         return false;
3435                 }
3436                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
3437                 {
3438                         ereport(emode,
3439                                         (errmsg("WAL file is from different system"),
3440                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
3441                         return false;
3442                 }
3443         }
3444         else if (readOff == 0)
3445         {
3446                 /* hmm, first page of file doesn't have a long header? */
3447                 ereport(emode,
3448                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3449                                                 hdr->xlp_info, readId, readSeg, readOff)));
3450                 return false;
3451         }
3452
3453         recaddr.xlogid = readId;
3454         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3455         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
3456         {
3457                 ereport(emode,
3458                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
3459                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
3460                                                 readId, readSeg, readOff)));
3461                 return false;
3462         }
3463
3464         /*
3465          * Check page TLI is one of the expected values.
3466          */
3467         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
3468         {
3469                 ereport(emode,
3470                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
3471                                                 hdr->xlp_tli,
3472                                                 readId, readSeg, readOff)));
3473                 return false;
3474         }
3475
3476         /*
3477          * Since child timelines are always assigned a TLI greater than their
3478          * immediate parent's TLI, we should never see TLI go backwards across
3479          * successive pages of a consistent WAL sequence.
3480          *
3481          * Of course this check should only be applied when advancing sequentially
3482          * across pages; therefore ReadRecord resets lastPageTLI to zero when
3483          * going to a random page.
3484          */
3485         if (hdr->xlp_tli < lastPageTLI)
3486         {
3487                 ereport(emode,
3488                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
3489                                                 hdr->xlp_tli, lastPageTLI,
3490                                                 readId, readSeg, readOff)));
3491                 return false;
3492         }
3493         lastPageTLI = hdr->xlp_tli;
3494         return true;
3495 }
3496
3497 /*
3498  * Try to read a timeline's history file.
3499  *
3500  * If successful, return the list of component TLIs (the given TLI followed by
3501  * its ancestor TLIs).  If we can't find the history file, assume that the
3502  * timeline has no parents, and return a list of just the specified timeline
3503  * ID.
3504  */
3505 static List *
3506 readTimeLineHistory(TimeLineID targetTLI)
3507 {
3508         List       *result;
3509         char            path[MAXPGPATH];
3510         char            histfname[MAXFNAMELEN];
3511         char            fline[MAXPGPATH];
3512         FILE       *fd;
3513
3514         if (InArchiveRecovery)
3515         {
3516                 TLHistoryFileName(histfname, targetTLI);
3517                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3518         }
3519         else
3520                 TLHistoryFilePath(path, targetTLI);
3521
3522         fd = AllocateFile(path, "r");
3523         if (fd == NULL)
3524         {
3525                 if (errno != ENOENT)
3526                         ereport(FATAL,
3527                                         (errcode_for_file_access(),
3528                                          errmsg("could not open file \"%s\": %m", path)));
3529                 /* Not there, so assume no parents */
3530                 return list_make1_int((int) targetTLI);
3531         }
3532
3533         result = NIL;
3534
3535         /*
3536          * Parse the file...
3537          */
3538         while (fgets(fline, sizeof(fline), fd) != NULL)
3539         {
3540                 /* skip leading whitespace and check for # comment */
3541                 char       *ptr;
3542                 char       *endptr;
3543                 TimeLineID      tli;
3544
3545                 for (ptr = fline; *ptr; ptr++)
3546                 {
3547                         if (!isspace((unsigned char) *ptr))
3548                                 break;
3549                 }
3550                 if (*ptr == '\0' || *ptr == '#')
3551                         continue;
3552
3553                 /* expect a numeric timeline ID as first field of line */
3554                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
3555                 if (endptr == ptr)
3556                         ereport(FATAL,
3557                                         (errmsg("syntax error in history file: %s", fline),
3558                                          errhint("Expected a numeric timeline ID.")));
3559
3560                 if (result &&
3561                         tli <= (TimeLineID) linitial_int(result))
3562                         ereport(FATAL,
3563                                         (errmsg("invalid data in history file: %s", fline),
3564                                    errhint("Timeline IDs must be in increasing sequence.")));
3565
3566                 /* Build list with newest item first */
3567                 result = lcons_int((int) tli, result);
3568
3569                 /* we ignore the remainder of each line */
3570         }
3571
3572         FreeFile(fd);
3573
3574         if (result &&
3575                 targetTLI <= (TimeLineID) linitial_int(result))
3576                 ereport(FATAL,
3577                                 (errmsg("invalid data in history file \"%s\"", path),
3578                         errhint("Timeline IDs must be less than child timeline's ID.")));
3579
3580         result = lcons_int((int) targetTLI, result);
3581
3582         ereport(DEBUG3,
3583                         (errmsg_internal("history of timeline %u is %s",
3584                                                          targetTLI, nodeToString(result))));
3585
3586         return result;
3587 }
3588
3589 /*
3590  * Probe whether a timeline history file exists for the given timeline ID
3591  */
3592 static bool
3593 existsTimeLineHistory(TimeLineID probeTLI)
3594 {
3595         char            path[MAXPGPATH];
3596         char            histfname[MAXFNAMELEN];
3597         FILE       *fd;
3598
3599         if (InArchiveRecovery)
3600         {
3601                 TLHistoryFileName(histfname, probeTLI);
3602                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3603         }
3604         else
3605                 TLHistoryFilePath(path, probeTLI);
3606
3607         fd = AllocateFile(path, "r");
3608         if (fd != NULL)
3609         {
3610                 FreeFile(fd);
3611                 return true;
3612         }
3613         else
3614         {
3615                 if (errno != ENOENT)
3616                         ereport(FATAL,
3617                                         (errcode_for_file_access(),
3618                                          errmsg("could not open file \"%s\": %m", path)));
3619                 return false;
3620         }
3621 }
3622
3623 /*
3624  * Find the newest existing timeline, assuming that startTLI exists.
3625  *
3626  * Note: while this is somewhat heuristic, it does positively guarantee
3627  * that (result + 1) is not a known timeline, and therefore it should
3628  * be safe to assign that ID to a new timeline.
3629  */
3630 static TimeLineID
3631 findNewestTimeLine(TimeLineID startTLI)
3632 {
3633         TimeLineID      newestTLI;
3634         TimeLineID      probeTLI;
3635
3636         /*
3637          * The algorithm is just to probe for the existence of timeline history
3638          * files.  XXX is it useful to allow gaps in the sequence?
3639          */
3640         newestTLI = startTLI;
3641
3642         for (probeTLI = startTLI + 1;; probeTLI++)
3643         {
3644                 if (existsTimeLineHistory(probeTLI))
3645                 {
3646                         newestTLI = probeTLI;           /* probeTLI exists */
3647                 }
3648                 else
3649                 {
3650                         /* doesn't exist, assume we're done */
3651                         break;
3652                 }
3653         }
3654
3655         return newestTLI;
3656 }
3657
3658 /*
3659  * Create a new timeline history file.
3660  *
3661  *      newTLI: ID of the new timeline
3662  *      parentTLI: ID of its immediate parent
3663  *      endTLI et al: ID of the last used WAL file, for annotation purposes
3664  *
3665  * Currently this is only used during recovery, and so there are no locking
3666  * considerations.      But we should be just as tense as XLogFileInit to avoid
3667  * emplacing a bogus file.
3668  */
3669 static void
3670 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
3671                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
3672 {
3673         char            path[MAXPGPATH];
3674         char            tmppath[MAXPGPATH];
3675         char            histfname[MAXFNAMELEN];
3676         char            xlogfname[MAXFNAMELEN];
3677         char            buffer[BLCKSZ];
3678         int                     srcfd;
3679         int                     fd;
3680         int                     nbytes;
3681
3682         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
3683
3684         /*
3685          * Write into a temp file name.
3686          */
3687         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3688
3689         unlink(tmppath);
3690
3691         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3692         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
3693                                            S_IRUSR | S_IWUSR);
3694         if (fd < 0)
3695                 ereport(ERROR,
3696                                 (errcode_for_file_access(),
3697                                  errmsg("could not create file \"%s\": %m", tmppath)));
3698
3699         /*
3700          * If a history file exists for the parent, copy it verbatim
3701          */
3702         if (InArchiveRecovery)
3703         {
3704                 TLHistoryFileName(histfname, parentTLI);
3705                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
3706         }
3707         else
3708                 TLHistoryFilePath(path, parentTLI);
3709
3710         srcfd = BasicOpenFile(path, O_RDONLY, 0);
3711         if (srcfd < 0)
3712         {
3713                 if (errno != ENOENT)
3714                         ereport(ERROR,
3715                                         (errcode_for_file_access(),
3716                                          errmsg("could not open file \"%s\": %m", path)));
3717                 /* Not there, so assume parent has no parents */
3718         }
3719         else
3720         {
3721                 for (;;)
3722                 {
3723                         errno = 0;
3724                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
3725                         if (nbytes < 0 || errno != 0)
3726                                 ereport(ERROR,
3727                                                 (errcode_for_file_access(),
3728                                                  errmsg("could not read file \"%s\": %m", path)));
3729                         if (nbytes == 0)
3730                                 break;
3731                         errno = 0;
3732                         if ((int) write(fd, buffer, nbytes) != nbytes)
3733                         {
3734                                 int                     save_errno = errno;
3735
3736                                 /*
3737                                  * If we fail to make the file, delete it to release disk
3738                                  * space
3739                                  */
3740                                 unlink(tmppath);
3741
3742                                 /*
3743                                  * if write didn't set errno, assume problem is no disk space
3744                                  */
3745                                 errno = save_errno ? save_errno : ENOSPC;
3746
3747                                 ereport(ERROR,
3748                                                 (errcode_for_file_access(),
3749                                          errmsg("could not write to file \"%s\": %m", tmppath)));
3750                         }
3751                 }
3752                 close(srcfd);
3753         }
3754
3755         /*
3756          * Append one line with the details of this timeline split.
3757          *
3758          * If we did have a parent file, insert an extra newline just in case the
3759          * parent file failed to end with one.
3760          */
3761         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
3762
3763         snprintf(buffer, sizeof(buffer),
3764                          "%s%u\t%s\t%s transaction %u at %s\n",
3765                          (srcfd < 0) ? "" : "\n",
3766                          parentTLI,
3767                          xlogfname,
3768                          recoveryStopAfter ? "after" : "before",
3769                          recoveryStopXid,
3770                          timestamptz_to_str(recoveryStopTime));
3771
3772         nbytes = strlen(buffer);
3773         errno = 0;
3774         if ((int) write(fd, buffer, nbytes) != nbytes)
3775         {
3776                 int                     save_errno = errno;
3777
3778                 /*
3779                  * If we fail to make the file, delete it to release disk space
3780                  */
3781                 unlink(tmppath);
3782                 /* if write didn't set errno, assume problem is no disk space */
3783                 errno = save_errno ? save_errno : ENOSPC;
3784
3785                 ereport(ERROR,
3786                                 (errcode_for_file_access(),
3787                                  errmsg("could not write to file \"%s\": %m", tmppath)));
3788         }
3789
3790         if (pg_fsync(fd) != 0)
3791                 ereport(ERROR,
3792                                 (errcode_for_file_access(),
3793                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
3794
3795         if (close(fd))
3796                 ereport(ERROR,
3797                                 (errcode_for_file_access(),
3798                                  errmsg("could not close file \"%s\": %m", tmppath)));
3799
3800
3801         /*
3802          * Now move the completed history file into place with its final name.
3803          */
3804         TLHistoryFilePath(path, newTLI);
3805
3806         /*
3807          * Prefer link() to rename() here just to be really sure that we don't
3808          * overwrite an existing logfile.  However, there shouldn't be one, so
3809          * rename() is an acceptable substitute except for the truly paranoid.
3810          */
3811 #if HAVE_WORKING_LINK
3812         if (link(tmppath, path) < 0)
3813                 ereport(ERROR,
3814                                 (errcode_for_file_access(),
3815                                  errmsg("could not link file \"%s\" to \"%s\": %m",
3816                                                 tmppath, path)));
3817         unlink(tmppath);
3818 #else
3819         if (rename(tmppath, path) < 0)
3820                 ereport(ERROR,
3821                                 (errcode_for_file_access(),
3822                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
3823                                                 tmppath, path)));
3824 #endif
3825
3826         /* The history file can be archived immediately. */
3827         TLHistoryFileName(histfname, newTLI);
3828         XLogArchiveNotify(histfname);
3829 }
3830
3831 /*
3832  * I/O routines for pg_control
3833  *
3834  * *ControlFile is a buffer in shared memory that holds an image of the
3835  * contents of pg_control.      WriteControlFile() initializes pg_control
3836  * given a preloaded buffer, ReadControlFile() loads the buffer from
3837  * the pg_control file (during postmaster or standalone-backend startup),
3838  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
3839  *
3840  * For simplicity, WriteControlFile() initializes the fields of pg_control
3841  * that are related to checking backend/database compatibility, and
3842  * ReadControlFile() verifies they are correct.  We could split out the
3843  * I/O and compatibility-check functions, but there seems no need currently.
3844  */
3845 static void
3846 WriteControlFile(void)
3847 {
3848         int                     fd;
3849         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
3850
3851         /*
3852          * Initialize version and compatibility-check fields
3853          */
3854         ControlFile->pg_control_version = PG_CONTROL_VERSION;
3855         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
3856
3857         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
3858         ControlFile->floatFormat = FLOATFORMAT_VALUE;
3859
3860         ControlFile->blcksz = BLCKSZ;
3861         ControlFile->relseg_size = RELSEG_SIZE;
3862         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
3863         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
3864
3865         ControlFile->nameDataLen = NAMEDATALEN;
3866         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
3867
3868         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
3869
3870 #ifdef HAVE_INT64_TIMESTAMP
3871         ControlFile->enableIntTimes = true;
3872 #else
3873         ControlFile->enableIntTimes = false;
3874 #endif
3875         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
3876         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
3877
3878         /* Contents are protected with a CRC */
3879         INIT_CRC32(ControlFile->crc);
3880         COMP_CRC32(ControlFile->crc,
3881                            (char *) ControlFile,
3882                            offsetof(ControlFileData, crc));
3883         FIN_CRC32(ControlFile->crc);
3884
3885         /*
3886          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
3887          * excess over sizeof(ControlFileData).  This reduces the odds of
3888          * premature-EOF errors when reading pg_control.  We'll still fail when we
3889          * check the contents of the file, but hopefully with a more specific
3890          * error than "couldn't read pg_control".
3891          */
3892         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
3893                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
3894
3895         memset(buffer, 0, PG_CONTROL_SIZE);
3896         memcpy(buffer, ControlFile, sizeof(ControlFileData));
3897
3898         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3899                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3900                                            S_IRUSR | S_IWUSR);
3901         if (fd < 0)
3902                 ereport(PANIC,
3903                                 (errcode_for_file_access(),
3904                                  errmsg("could not create control file \"%s\": %m",
3905                                                 XLOG_CONTROL_FILE)));
3906
3907         errno = 0;
3908         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
3909         {
3910                 /* if write didn't set errno, assume problem is no disk space */
3911                 if (errno == 0)
3912                         errno = ENOSPC;
3913                 ereport(PANIC,
3914                                 (errcode_for_file_access(),
3915                                  errmsg("could not write to control file: %m")));
3916         }
3917
3918         if (pg_fsync(fd) != 0)
3919                 ereport(PANIC,
3920                                 (errcode_for_file_access(),
3921                                  errmsg("could not fsync control file: %m")));
3922
3923         if (close(fd))
3924                 ereport(PANIC,
3925                                 (errcode_for_file_access(),
3926                                  errmsg("could not close control file: %m")));
3927 }
3928
3929 static void
3930 ReadControlFile(void)
3931 {
3932         pg_crc32        crc;
3933         int                     fd;
3934
3935         /*
3936          * Read data...
3937          */
3938         fd = BasicOpenFile(XLOG_CONTROL_FILE,
3939                                            O_RDWR | PG_BINARY,
3940                                            S_IRUSR | S_IWUSR);
3941         if (fd < 0)
3942                 ereport(PANIC,
3943                                 (errcode_for_file_access(),
3944                                  errmsg("could not open control file \"%s\": %m",
3945                                                 XLOG_CONTROL_FILE)));
3946
3947         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
3948                 ereport(PANIC,
3949                                 (errcode_for_file_access(),
3950                                  errmsg("could not read from control file: %m")));
3951
3952         close(fd);
3953
3954         /*
3955          * Check for expected pg_control format version.  If this is wrong, the
3956          * CRC check will likely fail because we'll be checking the wrong number
3957          * of bytes.  Complaining about wrong version will probably be more
3958          * enlightening than complaining about wrong CRC.
3959          */
3960
3961         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
3962                 ereport(FATAL,
3963                                 (errmsg("database files are incompatible with server"),
3964                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
3965                                                    " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
3966                                                    ControlFile->pg_control_version, ControlFile->pg_control_version,
3967                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
3968                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
3969
3970         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
3971                 ereport(FATAL,
3972                                 (errmsg("database files are incompatible with server"),
3973                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
3974                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
3975                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
3976                                  errhint("It looks like you need to initdb.")));
3977
3978         /* Now check the CRC. */
3979         INIT_CRC32(crc);
3980         COMP_CRC32(crc,
3981                            (char *) ControlFile,
3982                            offsetof(ControlFileData, crc));
3983         FIN_CRC32(crc);
3984
3985         if (!EQ_CRC32(crc, ControlFile->crc))
3986                 ereport(FATAL,
3987                                 (errmsg("incorrect checksum in control file")));
3988
3989         /*
3990          * Do compatibility checking immediately.  We do this here for 2 reasons:
3991          *
3992          * (1) if the database isn't compatible with the backend executable, we
3993          * want to abort before we can possibly do any damage;
3994          *
3995          * (2) this code is executed in the postmaster, so the setlocale() will
3996          * propagate to forked backends, which aren't going to read this file for
3997          * themselves.  (These locale settings are considered critical
3998          * compatibility items because they can affect sort order of indexes.)
3999          */
4000         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4001                 ereport(FATAL,
4002                                 (errmsg("database files are incompatible with server"),
4003                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4004                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4005                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4006                                  errhint("It looks like you need to initdb.")));
4007         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4008                 ereport(FATAL,
4009                                 (errmsg("database files are incompatible with server"),
4010                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4011                                          " but the server was compiled with MAXALIGN %d.",
4012                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4013                                  errhint("It looks like you need to initdb.")));
4014         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4015                 ereport(FATAL,
4016                                 (errmsg("database files are incompatible with server"),
4017                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4018                                  errhint("It looks like you need to initdb.")));
4019         if (ControlFile->blcksz != BLCKSZ)
4020                 ereport(FATAL,
4021                                 (errmsg("database files are incompatible with server"),
4022                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4023                                            " but the server was compiled with BLCKSZ %d.",
4024                                            ControlFile->blcksz, BLCKSZ),
4025                                  errhint("It looks like you need to recompile or initdb.")));
4026         if (ControlFile->relseg_size != RELSEG_SIZE)
4027                 ereport(FATAL,
4028                                 (errmsg("database files are incompatible with server"),
4029                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4030                                   " but the server was compiled with RELSEG_SIZE %d.",
4031                                   ControlFile->relseg_size, RELSEG_SIZE),
4032                                  errhint("It looks like you need to recompile or initdb.")));
4033         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4034                 ereport(FATAL,
4035                                 (errmsg("database files are incompatible with server"),
4036                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4037                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4038                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4039                                  errhint("It looks like you need to recompile or initdb.")));
4040         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4041                 ereport(FATAL,
4042                                 (errmsg("database files are incompatible with server"),
4043                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4044                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4045                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4046                                  errhint("It looks like you need to recompile or initdb.")));
4047         if (ControlFile->nameDataLen != NAMEDATALEN)
4048                 ereport(FATAL,
4049                                 (errmsg("database files are incompatible with server"),
4050                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4051                                   " but the server was compiled with NAMEDATALEN %d.",
4052                                   ControlFile->nameDataLen, NAMEDATALEN),
4053                                  errhint("It looks like you need to recompile or initdb.")));
4054         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4055                 ereport(FATAL,
4056                                 (errmsg("database files are incompatible with server"),
4057                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4058                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4059                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4060                                  errhint("It looks like you need to recompile or initdb.")));
4061         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4062                 ereport(FATAL,
4063                                 (errmsg("database files are incompatible with server"),
4064                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4065                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4066                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4067                                  errhint("It looks like you need to recompile or initdb.")));
4068
4069 #ifdef HAVE_INT64_TIMESTAMP
4070         if (ControlFile->enableIntTimes != true)
4071                 ereport(FATAL,
4072                                 (errmsg("database files are incompatible with server"),
4073                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4074                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4075                                  errhint("It looks like you need to recompile or initdb.")));
4076 #else
4077         if (ControlFile->enableIntTimes != false)
4078                 ereport(FATAL,
4079                                 (errmsg("database files are incompatible with server"),
4080                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4081                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4082                                  errhint("It looks like you need to recompile or initdb.")));
4083 #endif
4084
4085 #ifdef USE_FLOAT4_BYVAL
4086         if (ControlFile->float4ByVal != true)
4087                 ereport(FATAL,
4088                                 (errmsg("database files are incompatible with server"),
4089                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4090                                                    " but the server was compiled with USE_FLOAT4_BYVAL."),
4091                                  errhint("It looks like you need to recompile or initdb.")));
4092 #else
4093         if (ControlFile->float4ByVal != false)
4094                 ereport(FATAL,
4095                                 (errmsg("database files are incompatible with server"),
4096                                  errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4097                                                    " but the server was compiled without USE_FLOAT4_BYVAL."),
4098                                  errhint("It looks like you need to recompile or initdb.")));
4099 #endif
4100
4101 #ifdef USE_FLOAT8_BYVAL
4102         if (ControlFile->float8ByVal != true)
4103                 ereport(FATAL,
4104                                 (errmsg("database files are incompatible with server"),
4105                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4106                                                    " but the server was compiled with USE_FLOAT8_BYVAL."),
4107                                  errhint("It looks like you need to recompile or initdb.")));
4108 #else
4109         if (ControlFile->float8ByVal != false)
4110                 ereport(FATAL,
4111                                 (errmsg("database files are incompatible with server"),
4112                                  errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4113                                                    " but the server was compiled without USE_FLOAT8_BYVAL."),
4114                                  errhint("It looks like you need to recompile or initdb.")));
4115 #endif
4116 }
4117
4118 void
4119 UpdateControlFile(void)
4120 {
4121         int                     fd;
4122
4123         INIT_CRC32(ControlFile->crc);
4124         COMP_CRC32(ControlFile->crc,
4125                            (char *) ControlFile,
4126                            offsetof(ControlFileData, crc));
4127         FIN_CRC32(ControlFile->crc);
4128
4129         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4130                                            O_RDWR | PG_BINARY,
4131                                            S_IRUSR | S_IWUSR);
4132         if (fd < 0)
4133                 ereport(PANIC,
4134                                 (errcode_for_file_access(),
4135                                  errmsg("could not open control file \"%s\": %m",
4136                                                 XLOG_CONTROL_FILE)));
4137
4138         errno = 0;
4139         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4140         {
4141                 /* if write didn't set errno, assume problem is no disk space */
4142                 if (errno == 0)
4143                         errno = ENOSPC;
4144                 ereport(PANIC,
4145                                 (errcode_for_file_access(),
4146                                  errmsg("could not write to control file: %m")));
4147         }
4148
4149         if (pg_fsync(fd) != 0)
4150                 ereport(PANIC,
4151                                 (errcode_for_file_access(),
4152                                  errmsg("could not fsync control file: %m")));
4153
4154         if (close(fd))
4155                 ereport(PANIC,
4156                                 (errcode_for_file_access(),
4157                                  errmsg("could not close control file: %m")));
4158 }
4159
4160 /*
4161  * Initialization of shared memory for XLOG
4162  */
4163 Size
4164 XLOGShmemSize(void)
4165 {
4166         Size            size;
4167
4168         /* XLogCtl */
4169         size = sizeof(XLogCtlData);
4170         /* xlblocks array */
4171         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4172         /* extra alignment padding for XLOG I/O buffers */
4173         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4174         /* and the buffers themselves */
4175         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4176
4177         /*
4178          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4179          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4180          * routine again below to compute the actual allocation size.
4181          */
4182
4183         return size;
4184 }
4185
4186 void
4187 XLOGShmemInit(void)
4188 {
4189         bool            foundCFile,
4190                                 foundXLog;
4191         char       *allocptr;
4192
4193         ControlFile = (ControlFileData *)
4194                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4195         XLogCtl = (XLogCtlData *)
4196                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4197
4198         if (foundCFile || foundXLog)
4199         {
4200                 /* both should be present or neither */
4201                 Assert(foundCFile && foundXLog);
4202                 return;
4203         }
4204
4205         memset(XLogCtl, 0, sizeof(XLogCtlData));
4206
4207         /*
4208          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4209          * multiple of the alignment for same, so no extra alignment padding is
4210          * needed here.
4211          */
4212         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4213         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4214         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4215         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4216
4217         /*
4218          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4219          */
4220         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4221         XLogCtl->pages = allocptr;
4222         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4223
4224         /*
4225          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4226          * in additional info.)
4227          */
4228         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4229         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4230         SpinLockInit(&XLogCtl->info_lck);
4231
4232         /*
4233          * If we are not in bootstrap mode, pg_control should already exist. Read
4234          * and validate it immediately (see comments in ReadControlFile() for the
4235          * reasons why).
4236          */
4237         if (!IsBootstrapProcessingMode())
4238                 ReadControlFile();
4239 }
4240
4241 /*
4242  * This func must be called ONCE on system install.  It creates pg_control
4243  * and the initial XLOG segment.
4244  */
4245 void
4246 BootStrapXLOG(void)
4247 {
4248         CheckPoint      checkPoint;
4249         char       *buffer;
4250         XLogPageHeader page;
4251         XLogLongPageHeader longpage;
4252         XLogRecord *record;
4253         bool            use_existent;
4254         uint64          sysidentifier;
4255         struct timeval tv;
4256         pg_crc32        crc;
4257
4258         /*
4259          * Select a hopefully-unique system identifier code for this installation.
4260          * We use the result of gettimeofday(), including the fractional seconds
4261          * field, as being about as unique as we can easily get.  (Think not to
4262          * use random(), since it hasn't been seeded and there's no portable way
4263          * to seed it other than the system clock value...)  The upper half of the
4264          * uint64 value is just the tv_sec part, while the lower half is the XOR
4265          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4266          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4267          * knowing this encoding can determine the initialization time of the
4268          * installation, which could perhaps be useful sometimes.
4269          */
4270         gettimeofday(&tv, NULL);
4271         sysidentifier = ((uint64) tv.tv_sec) << 32;
4272         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4273
4274         /* First timeline ID is always 1 */
4275         ThisTimeLineID = 1;
4276
4277         /* page buffer must be aligned suitably for O_DIRECT */
4278         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4279         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4280         memset(page, 0, XLOG_BLCKSZ);
4281
4282         /* Set up information for the initial checkpoint record */
4283         checkPoint.redo.xlogid = 0;
4284         checkPoint.redo.xrecoff = SizeOfXLogLongPHD;
4285         checkPoint.ThisTimeLineID = ThisTimeLineID;
4286         checkPoint.nextXidEpoch = 0;
4287         checkPoint.nextXid = FirstNormalTransactionId;
4288         checkPoint.nextOid = FirstBootstrapObjectId;
4289         checkPoint.nextMulti = FirstMultiXactId;
4290         checkPoint.nextMultiOffset = 0;
4291         checkPoint.time = (pg_time_t) time(NULL);
4292
4293         ShmemVariableCache->nextXid = checkPoint.nextXid;
4294         ShmemVariableCache->nextOid = checkPoint.nextOid;
4295         ShmemVariableCache->oidCount = 0;
4296         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4297
4298         /* Set up the XLOG page header */
4299         page->xlp_magic = XLOG_PAGE_MAGIC;
4300         page->xlp_info = XLP_LONG_HEADER;
4301         page->xlp_tli = ThisTimeLineID;
4302         page->xlp_pageaddr.xlogid = 0;
4303         page->xlp_pageaddr.xrecoff = 0;
4304         longpage = (XLogLongPageHeader) page;
4305         longpage->xlp_sysid = sysidentifier;
4306         longpage->xlp_seg_size = XLogSegSize;
4307         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4308
4309         /* Insert the initial checkpoint record */
4310         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4311         record->xl_prev.xlogid = 0;
4312         record->xl_prev.xrecoff = 0;
4313         record->xl_xid = InvalidTransactionId;
4314         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4315         record->xl_len = sizeof(checkPoint);
4316         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4317         record->xl_rmid = RM_XLOG_ID;
4318         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4319
4320         INIT_CRC32(crc);
4321         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4322         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4323                            SizeOfXLogRecord - sizeof(pg_crc32));
4324         FIN_CRC32(crc);
4325         record->xl_crc = crc;
4326
4327         /* Create first XLOG segment file */
4328         use_existent = false;
4329         openLogFile = XLogFileInit(0, 0, &use_existent, false);
4330
4331         /* Write the first page with the initial record */
4332         errno = 0;
4333         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4334         {
4335                 /* if write didn't set errno, assume problem is no disk space */
4336                 if (errno == 0)
4337                         errno = ENOSPC;
4338                 ereport(PANIC,
4339                                 (errcode_for_file_access(),
4340                           errmsg("could not write bootstrap transaction log file: %m")));
4341         }
4342
4343         if (pg_fsync(openLogFile) != 0)
4344                 ereport(PANIC,
4345                                 (errcode_for_file_access(),
4346                           errmsg("could not fsync bootstrap transaction log file: %m")));
4347
4348         if (close(openLogFile))
4349                 ereport(PANIC,
4350                                 (errcode_for_file_access(),
4351                           errmsg("could not close bootstrap transaction log file: %m")));
4352
4353         openLogFile = -1;
4354
4355         /* Now create pg_control */
4356
4357         memset(ControlFile, 0, sizeof(ControlFileData));
4358         /* Initialize pg_control status fields */
4359         ControlFile->system_identifier = sysidentifier;
4360         ControlFile->state = DB_SHUTDOWNED;
4361         ControlFile->time = checkPoint.time;
4362         ControlFile->checkPoint = checkPoint.redo;
4363         ControlFile->checkPointCopy = checkPoint;
4364         /* some additional ControlFile fields are set in WriteControlFile() */
4365
4366         WriteControlFile();
4367
4368         /* Bootstrap the commit log, too */
4369         BootStrapCLOG();
4370         BootStrapSUBTRANS();
4371         BootStrapMultiXact();
4372
4373         pfree(buffer);
4374 }
4375
4376 static char *
4377 str_time(pg_time_t tnow)
4378 {
4379         static char buf[128];
4380
4381         pg_strftime(buf, sizeof(buf),
4382                                 "%Y-%m-%d %H:%M:%S %Z",
4383                                 pg_localtime(&tnow, log_timezone));
4384
4385         return buf;
4386 }
4387
4388 /*
4389  * See if there is a recovery command file (recovery.conf), and if so
4390  * read in parameters for archive recovery.
4391  *
4392  * XXX longer term intention is to expand this to
4393  * cater for additional parameters and controls
4394  * possibly use a flex lexer similar to the GUC one
4395  */
4396 static void
4397 readRecoveryCommandFile(void)
4398 {
4399         FILE       *fd;
4400         char            cmdline[MAXPGPATH];
4401         TimeLineID      rtli = 0;
4402         bool            rtliGiven = false;
4403         bool            syntaxError = false;
4404
4405         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
4406         if (fd == NULL)
4407         {
4408                 if (errno == ENOENT)
4409                         return;                         /* not there, so no archive recovery */
4410                 ereport(FATAL,
4411                                 (errcode_for_file_access(),
4412                                  errmsg("could not open recovery command file \"%s\": %m",
4413                                                 RECOVERY_COMMAND_FILE)));
4414         }
4415
4416         ereport(LOG,
4417                         (errmsg("starting archive recovery")));
4418
4419         /*
4420          * Parse the file...
4421          */
4422         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
4423         {
4424                 /* skip leading whitespace and check for # comment */
4425                 char       *ptr;
4426                 char       *tok1;
4427                 char       *tok2;
4428
4429                 for (ptr = cmdline; *ptr; ptr++)
4430                 {
4431                         if (!isspace((unsigned char) *ptr))
4432                                 break;
4433                 }
4434                 if (*ptr == '\0' || *ptr == '#')
4435                         continue;
4436
4437                 /* identify the quoted parameter value */
4438                 tok1 = strtok(ptr, "'");
4439                 if (!tok1)
4440                 {
4441                         syntaxError = true;
4442                         break;
4443                 }
4444                 tok2 = strtok(NULL, "'");
4445                 if (!tok2)
4446                 {
4447                         syntaxError = true;
4448                         break;
4449                 }
4450                 /* reparse to get just the parameter name */
4451                 tok1 = strtok(ptr, " \t=");
4452                 if (!tok1)
4453                 {
4454                         syntaxError = true;
4455                         break;
4456                 }
4457
4458                 if (strcmp(tok1, "restore_command") == 0)
4459                 {
4460                         recoveryRestoreCommand = pstrdup(tok2);
4461                         ereport(LOG,
4462                                         (errmsg("restore_command = '%s'",
4463                                                         recoveryRestoreCommand)));
4464                 }
4465                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
4466                 {
4467                         rtliGiven = true;
4468                         if (strcmp(tok2, "latest") == 0)
4469                                 rtli = 0;
4470                         else
4471                         {
4472                                 errno = 0;
4473                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
4474                                 if (errno == EINVAL || errno == ERANGE)
4475                                         ereport(FATAL,
4476                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
4477                                                                         tok2)));
4478                         }
4479                         if (rtli)
4480                                 ereport(LOG,
4481                                                 (errmsg("recovery_target_timeline = %u", rtli)));
4482                         else
4483                                 ereport(LOG,
4484                                                 (errmsg("recovery_target_timeline = latest")));
4485                 }
4486                 else if (strcmp(tok1, "recovery_target_xid") == 0)
4487                 {
4488                         errno = 0;
4489                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
4490                         if (errno == EINVAL || errno == ERANGE)
4491                                 ereport(FATAL,
4492                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
4493                                                  tok2)));
4494                         ereport(LOG,
4495                                         (errmsg("recovery_target_xid = %u",
4496                                                         recoveryTargetXid)));
4497                         recoveryTarget = true;
4498                         recoveryTargetExact = true;
4499                 }
4500                 else if (strcmp(tok1, "recovery_target_time") == 0)
4501                 {
4502                         /*
4503                          * if recovery_target_xid specified, then this overrides
4504                          * recovery_target_time
4505                          */
4506                         if (recoveryTargetExact)
4507                                 continue;
4508                         recoveryTarget = true;
4509                         recoveryTargetExact = false;
4510
4511                         /*
4512                          * Convert the time string given by the user to TimestampTz form.
4513                          */
4514                         recoveryTargetTime =
4515                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
4516                                                                                                                 CStringGetDatum(tok2),
4517                                                                                                 ObjectIdGetDatum(InvalidOid),
4518                                                                                                                 Int32GetDatum(-1)));
4519                         ereport(LOG,
4520                                         (errmsg("recovery_target_time = '%s'",
4521                                                         timestamptz_to_str(recoveryTargetTime))));
4522                 }
4523                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
4524                 {
4525                         /*
4526                          * does nothing if a recovery_target is not also set
4527                          */
4528                         if (!parse_bool(tok2, &recoveryTargetInclusive))
4529                                   ereport(ERROR,
4530                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4531                                           errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
4532                         ereport(LOG,
4533                                         (errmsg("recovery_target_inclusive = %s", tok2)));
4534                 }
4535                 else if (strcmp(tok1, "log_restartpoints") == 0)
4536                 {
4537                         /*
4538                          * does nothing if a recovery_target is not also set
4539                          */
4540                         if (!parse_bool(tok2, &recoveryLogRestartpoints))
4541                                   ereport(ERROR,
4542                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4543                                           errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
4544                         ereport(LOG,
4545                                         (errmsg("log_restartpoints = %s", tok2)));
4546                 }
4547                 else
4548                         ereport(FATAL,
4549                                         (errmsg("unrecognized recovery parameter \"%s\"",
4550                                                         tok1)));
4551         }
4552
4553         FreeFile(fd);
4554
4555         if (syntaxError)
4556                 ereport(FATAL,
4557                                 (errmsg("syntax error in recovery command file: %s",
4558                                                 cmdline),
4559                           errhint("Lines should have the format parameter = 'value'.")));
4560
4561         /* Check that required parameters were supplied */
4562         if (recoveryRestoreCommand == NULL)
4563                 ereport(FATAL,
4564                                 (errmsg("recovery command file \"%s\" did not specify restore_command",
4565                                                 RECOVERY_COMMAND_FILE)));
4566
4567         /* Enable fetching from archive recovery area */
4568         InArchiveRecovery = true;
4569
4570         /*
4571          * If user specified recovery_target_timeline, validate it or compute the
4572          * "latest" value.      We can't do this until after we've gotten the restore
4573          * command and set InArchiveRecovery, because we need to fetch timeline
4574          * history files from the archive.
4575          */
4576         if (rtliGiven)
4577         {
4578                 if (rtli)
4579                 {
4580                         /* Timeline 1 does not have a history file, all else should */
4581                         if (rtli != 1 && !existsTimeLineHistory(rtli))
4582                                 ereport(FATAL,
4583                                                 (errmsg("recovery target timeline %u does not exist",
4584                                                                 rtli)));
4585                         recoveryTargetTLI = rtli;
4586                 }
4587                 else
4588                 {
4589                         /* We start the "latest" search from pg_control's timeline */
4590                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
4591                 }
4592         }
4593 }
4594
4595 /*
4596  * Exit archive-recovery state
4597  */
4598 static void
4599 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4600 {
4601         char            recoveryPath[MAXPGPATH];
4602         char            xlogpath[MAXPGPATH];
4603
4604         /*
4605          * We are no longer in archive recovery state.
4606          */
4607         InArchiveRecovery = false;
4608
4609         /*
4610          * We should have the ending log segment currently open.  Verify, and then
4611          * close it (to avoid problems on Windows with trying to rename or delete
4612          * an open file).
4613          */
4614         Assert(readFile >= 0);
4615         Assert(readId == endLogId);
4616         Assert(readSeg == endLogSeg);
4617
4618         close(readFile);
4619         readFile = -1;
4620
4621         /*
4622          * If the segment was fetched from archival storage, we want to replace
4623          * the existing xlog segment (if any) with the archival version.  This is
4624          * because whatever is in XLOGDIR is very possibly older than what we have
4625          * from the archives, since it could have come from restoring a PGDATA
4626          * backup.      In any case, the archival version certainly is more
4627          * descriptive of what our current database state is, because that is what
4628          * we replayed from.
4629          *
4630          * Note that if we are establishing a new timeline, ThisTimeLineID is
4631          * already set to the new value, and so we will create a new file instead
4632          * of overwriting any existing file.  (This is, in fact, always the case
4633          * at present.)
4634          */
4635         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
4636         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4637
4638         if (restoredFromArchive)
4639         {
4640                 ereport(DEBUG3,
4641                                 (errmsg_internal("moving last restored xlog to \"%s\"",
4642                                                                  xlogpath)));
4643                 unlink(xlogpath);               /* might or might not exist */
4644                 if (rename(recoveryPath, xlogpath) != 0)
4645                         ereport(FATAL,
4646                                         (errcode_for_file_access(),
4647                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
4648                                                         recoveryPath, xlogpath)));
4649                 /* XXX might we need to fix permissions on the file? */
4650         }
4651         else
4652         {
4653                 /*
4654                  * If the latest segment is not archival, but there's still a
4655                  * RECOVERYXLOG laying about, get rid of it.
4656                  */
4657                 unlink(recoveryPath);   /* ignore any error */
4658
4659                 /*
4660                  * If we are establishing a new timeline, we have to copy data from
4661                  * the last WAL segment of the old timeline to create a starting WAL
4662                  * segment for the new timeline.
4663                  */
4664                 if (endTLI != ThisTimeLineID)
4665                         XLogFileCopy(endLogId, endLogSeg,
4666                                                  endTLI, endLogId, endLogSeg);
4667         }
4668
4669         /*
4670          * Let's just make real sure there are not .ready or .done flags posted
4671          * for the new segment.
4672          */
4673         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
4674         XLogArchiveCleanup(xlogpath);
4675
4676         /* Get rid of any remaining recovered timeline-history file, too */
4677         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
4678         unlink(recoveryPath);           /* ignore any error */
4679
4680         /*
4681          * Rename the config file out of the way, so that we don't accidentally
4682          * re-enter archive recovery mode in a subsequent crash.
4683          */
4684         unlink(RECOVERY_COMMAND_DONE);
4685         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
4686                 ereport(FATAL,
4687                                 (errcode_for_file_access(),
4688                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4689                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
4690
4691         ereport(LOG,
4692                         (errmsg("archive recovery complete")));
4693 }
4694
4695 /*
4696  * For point-in-time recovery, this function decides whether we want to
4697  * stop applying the XLOG at or after the current record.
4698  *
4699  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
4700  * *includeThis is set TRUE if we should apply this record before stopping.
4701  * Also, some information is saved in recoveryStopXid et al for use in
4702  * annotating the new timeline's history file.
4703  */
4704 static bool
4705 recoveryStopsHere(XLogRecord *record, bool *includeThis)
4706 {
4707         bool            stopsHere;
4708         uint8           record_info;
4709         TimestampTz recordXtime;
4710
4711         /* We only consider stopping at COMMIT or ABORT records */
4712         if (record->xl_rmid != RM_XACT_ID)
4713                 return false;
4714         record_info = record->xl_info & ~XLR_INFO_MASK;
4715         if (record_info == XLOG_XACT_COMMIT)
4716         {
4717                 xl_xact_commit *recordXactCommitData;
4718
4719                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
4720                 recordXtime = recordXactCommitData->xact_time;
4721         }
4722         else if (record_info == XLOG_XACT_ABORT)
4723         {
4724                 xl_xact_abort *recordXactAbortData;
4725
4726                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
4727                 recordXtime = recordXactAbortData->xact_time;
4728         }
4729         else
4730                 return false;
4731
4732         /* Remember the most recent COMMIT/ABORT time for logging purposes */
4733         recoveryLastXTime = recordXtime;
4734
4735         /* Do we have a PITR target at all? */
4736         if (!recoveryTarget)
4737                 return false;
4738
4739         if (recoveryTargetExact)
4740         {
4741                 /*
4742                  * there can be only one transaction end record with this exact
4743                  * transactionid
4744                  *
4745                  * when testing for an xid, we MUST test for equality only, since
4746                  * transactions are numbered in the order they start, not the order
4747                  * they complete. A higher numbered xid will complete before you about
4748                  * 50% of the time...
4749                  */
4750                 stopsHere = (record->xl_xid == recoveryTargetXid);
4751                 if (stopsHere)
4752                         *includeThis = recoveryTargetInclusive;
4753         }
4754         else
4755         {
4756                 /*
4757                  * there can be many transactions that share the same commit time, so
4758                  * we stop after the last one, if we are inclusive, or stop at the
4759                  * first one if we are exclusive
4760                  */
4761                 if (recoveryTargetInclusive)
4762                         stopsHere = (recordXtime > recoveryTargetTime);
4763                 else
4764                         stopsHere = (recordXtime >= recoveryTargetTime);
4765                 if (stopsHere)
4766                         *includeThis = false;
4767         }
4768
4769         if (stopsHere)
4770         {
4771                 recoveryStopXid = record->xl_xid;
4772                 recoveryStopTime = recordXtime;
4773                 recoveryStopAfter = *includeThis;
4774
4775                 if (record_info == XLOG_XACT_COMMIT)
4776                 {
4777                         if (recoveryStopAfter)
4778                                 ereport(LOG,
4779                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
4780                                                                 recoveryStopXid,
4781                                                                 timestamptz_to_str(recoveryStopTime))));
4782                         else
4783                                 ereport(LOG,
4784                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
4785                                                                 recoveryStopXid,
4786                                                                 timestamptz_to_str(recoveryStopTime))));
4787                 }
4788                 else
4789                 {
4790                         if (recoveryStopAfter)
4791                                 ereport(LOG,
4792                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
4793                                                                 recoveryStopXid,
4794                                                                 timestamptz_to_str(recoveryStopTime))));
4795                         else
4796                                 ereport(LOG,
4797                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
4798                                                                 recoveryStopXid,
4799                                                                 timestamptz_to_str(recoveryStopTime))));
4800                 }
4801         }
4802
4803         return stopsHere;
4804 }
4805
4806 /*
4807  * This must be called ONCE during postmaster or standalone-backend startup
      *
      * StartupXLOG takes no arguments and returns nothing; conditions it cannot
      * proceed past are reported with ereport(FATAL) or ereport(PANIC).
      *
      * Overview of the steps performed, in order:
      *  1. Read pg_control, sanity-check its state and checkpoint pointer, and
      *     log how the database was last stopped.
      *  2. Set recoveryTargetTLI from pg_control, call
      *     readRecoveryCommandFile(), and build expectedTLIs; refuse to
      *     continue if pg_control's timeline is not in that list.
      *  3. Choose the starting checkpoint: the one named by the backup label
      *     if read_backup_label() succeeds, else pg_control's latest
      *     checkpoint, else pg_control's previous checkpoint.
      *  4. Seed nextXid, nextOid and the MultiXact counters from the checkpoint.
      *  5. Decide whether recovery is needed (InRecovery); if so, update
      *     pg_control, rename the backup label out of the way, run each
      *     resource manager's rm_startup hook, and replay records until no
      *     further record can be read or recoveryStopsHere() says to stop.
      *  6. Re-read the last record to establish EndOfLog, and fail if that is
      *     short of ControlFile->minRecoveryPoint.
      *  7. In archive recovery, select a new timeline ID, write its history
      *     file, and call exitArchiveRecovery().
      *  8. Initialize the WAL insert/write state from the last block read.
      *  9. If recovery ran, call rm_cleanup hooks, check for invalid pages,
      *     reset pgstat data, and write a shutdown checkpoint.
      * 10. Preallocate log files, mark pg_control DB_IN_PRODUCTION, start
      *     CLOG/SUBTRANS/MultiXact, reload prepared transactions, and release
      *     the read file and buffers.
4808  */
4809 void
4810 StartupXLOG(void)
4811 {
             /*
              * Key locals: checkPointLoc is the location of the starting
              * checkpoint record and checkPoint is a copy of its payload; LastRec
              * tracks the last record read or applied; EndOfLog (together with
              * endLogId/endLogSeg) is the end of valid WAL found by re-reading
              * LastRec; reachedStopPoint records that recoveryStopsHere() ended
              * the redo loop; haveBackupLabel records that read_backup_label()
              * succeeded.
              *
              * NOTE(review): minRecoveryLoc is only ever written through the
              * pointer passed to read_backup_label(), yet it is tested below even
              * when that call returned false -- presumably read_backup_label()
              * always initializes it; confirm.
              */
4812         XLogCtlInsert *Insert;
4813         CheckPoint      checkPoint;
4814         bool            wasShutdown;
4815         bool            reachedStopPoint = false;
4816         bool            haveBackupLabel = false;
4817         XLogRecPtr      RecPtr,
4818                                 LastRec,
4819                                 checkPointLoc,
4820                                 minRecoveryLoc,
4821                                 EndOfLog;
4822         uint32          endLogId;
4823         uint32          endLogSeg;
4824         XLogRecord *record;
4825         uint32          freespace;
4826         TransactionId oldestActiveXID;
4827
4828         /*
4829          * Read control file and check XLOG status looks valid.
4830          *
4831          * Note: in most control paths, *ControlFile is already valid and we need
4832          * not do ReadControlFile() here, but might as well do it to be sure.
4833          */
4834         ReadControlFile();
4835
4836         if (ControlFile->state < DB_SHUTDOWNED ||
4837                 ControlFile->state > DB_IN_PRODUCTION ||
4838                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
4839                 ereport(FATAL,
4840                                 (errmsg("control file contains invalid data")));
4841
             /* Report the state pg_control says the system was last left in */
4842         if (ControlFile->state == DB_SHUTDOWNED)
4843                 ereport(LOG,
4844                                 (errmsg("database system was shut down at %s",
4845                                                 str_time(ControlFile->time))));
4846         else if (ControlFile->state == DB_SHUTDOWNING)
4847                 ereport(LOG,
4848                                 (errmsg("database system shutdown was interrupted; last known up at %s",
4849                                                 str_time(ControlFile->time))));
4850         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
4851                 ereport(LOG,
4852                    (errmsg("database system was interrupted while in recovery at %s",
4853                                    str_time(ControlFile->time)),
4854                         errhint("This probably means that some data is corrupted and"
4855                                         " you will have to use the last backup for recovery.")));
4856         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
4857                 ereport(LOG,
4858                                 (errmsg("database system was interrupted while in recovery at log time %s",
4859                                                 str_time(ControlFile->checkPointCopy.time)),
4860                                  errhint("If this has occurred more than once some data might be corrupted"
4861                           " and you might need to choose an earlier recovery target.")));
4862         else if (ControlFile->state == DB_IN_PRODUCTION)
4863                 ereport(LOG,
4864                           (errmsg("database system was interrupted; last known up at %s",
4865                                           str_time(ControlFile->time))));
4866
4867         /* This is just to allow attaching to startup process with a debugger */
4868 #ifdef XLOG_REPLAY_DELAY
             /* NOTE(review): 60000000 is presumably microseconds (60 s) -- confirm pg_usleep units */
4869         if (ControlFile->state != DB_SHUTDOWNED)
4870                 pg_usleep(60000000L);
4871 #endif
4872
4873         /*
4874          * Initialize on the assumption we want to recover to the same timeline
4875          * that's active according to pg_control.
4876          */
4877         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
4878
4879         /*
4880          * Check for recovery control file, and if so set up state for offline
4881          * recovery
4882          */
4883         readRecoveryCommandFile();
4884
4885         /* Now we can determine the list of expected TLIs */
4886         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
4887
4888         /*
4889          * If pg_control's timeline is not in expectedTLIs, then we cannot
4890          * proceed: the backup is not part of the history of the requested
4891          * timeline.
4892          */
4893         if (!list_member_int(expectedTLIs,
4894                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
4895                 ereport(FATAL,
4896                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
4897                                                 recoveryTargetTLI,
4898                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
4899
4900         if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
4901         {
4902                 /*
4903                  * When a backup_label file is present, we want to roll forward from
4904                  * the checkpoint it identifies, rather than using pg_control.
4905                  */
4906                 record = ReadCheckpointRecord(checkPointLoc, 0);
4907                 if (record != NULL)
4908                 {
4909                         ereport(DEBUG1,
4910                                         (errmsg("checkpoint record is at %X/%X",
4911                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4912                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
4913                 }
4914                 else
4915                 {
4916                         ereport(PANIC,
4917                                         (errmsg("could not locate required checkpoint record"),
4918                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
4919                 }
4920                 /* set flag to delete it later */
4921                 haveBackupLabel = true;
4922         }
4923         else
4924         {
4925                 /*
4926                  * Get the last valid checkpoint record.  If the latest one according
4927                  * to pg_control is broken, try the next-to-last one.
4928                  */
4929                 checkPointLoc = ControlFile->checkPoint;
4930                 record = ReadCheckpointRecord(checkPointLoc, 1);
4931                 if (record != NULL)
4932                 {
4933                         ereport(DEBUG1,
4934                                         (errmsg("checkpoint record is at %X/%X",
4935                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4936                 }
4937                 else
4938                 {
4939                         checkPointLoc = ControlFile->prevCheckPoint;
4940                         record = ReadCheckpointRecord(checkPointLoc, 2);
4941                         if (record != NULL)
4942                         {
4943                                 ereport(LOG,
4944                                                 (errmsg("using previous checkpoint record at %X/%X",
4945                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
4946                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
4947                         }
4948                         else
4949                                 ereport(PANIC,
4950                                          (errmsg("could not locate a valid checkpoint record")));
4951                 }
4952         }
4953
             /*
              * Every path that reaches here has a non-NULL checkpoint record.  Start
              * both position trackers at the checkpoint and keep a local copy of
              * the checkpoint payload for the rest of the function.
              */
4954         LastRec = RecPtr = checkPointLoc;
4955         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
4956         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
4957
4958         ereport(DEBUG1,
4959                         (errmsg("redo record is at %X/%X; shutdown %s",
4960                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
4961                                         wasShutdown ? "TRUE" : "FALSE")));
4962         ereport(DEBUG1,
4963                         (errmsg("next transaction ID: %u/%u; next OID: %u",
4964                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
4965                                         checkPoint.nextOid)));
4966         ereport(DEBUG1,
4967                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
4968                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
4969         if (!TransactionIdIsNormal(checkPoint.nextXid))
4970                 ereport(PANIC,
4971                                 (errmsg("invalid next transaction ID")));
4972
             /* Seed the shared XID/OID/MultiXact counters from the checkpoint */
4973         ShmemVariableCache->nextXid = checkPoint.nextXid;
4974         ShmemVariableCache->nextOid = checkPoint.nextOid;
4975         ShmemVariableCache->oidCount = 0;
4976         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4977
4978         /*
4979          * We must replay WAL entries using the same TimeLineID they were created
4980          * under, so temporarily adopt the TLI indicated by the checkpoint (see
4981          * also xlog_redo()).
4982          */
4983         ThisTimeLineID = checkPoint.ThisTimeLineID;
4984
4985         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
4986
             /* The redo pointer must not lie beyond the checkpoint record itself */
4987         if (XLByteLT(RecPtr, checkPoint.redo))
4988                 ereport(PANIC,
4989                                 (errmsg("invalid redo in checkpoint record")));
4990
4991         /*
4992          * Check whether we need to force recovery from WAL.  If it appears to
4993          * have been a clean shutdown and we did not have a recovery.conf file,
4994          * then assume no recovery needed.
4995          */
4996         if (XLByteLT(checkPoint.redo, RecPtr))
4997         {
4998                 if (wasShutdown)
4999                         ereport(PANIC,
5000                                         (errmsg("invalid redo record in shutdown checkpoint")));
5001                 InRecovery = true;
5002         }
5003         else if (ControlFile->state != DB_SHUTDOWNED)
5004                 InRecovery = true;
5005         else if (InArchiveRecovery)
5006         {
5007                 /* force recovery due to presence of recovery.conf */
5008                 InRecovery = true;
5009         }
5010
5011         /* REDO */
5012         if (InRecovery)
5013         {
5014                 int                     rmid;
5015
5016                 /*
5017                  * Update pg_control to show that we are recovering and to show the
5018                  * selected checkpoint as the place we are starting from. We also mark
5019                  * pg_control with any minimum recovery stop point obtained from a
5020                  * backup history file.
5021                  */
5022                 if (InArchiveRecovery)
5023                 {
5024                         ereport(LOG,
5025                                         (errmsg("automatic recovery in progress")));
5026                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5027                 }
5028                 else
5029                 {
5030                         ereport(LOG,
5031                                         (errmsg("database system was not properly shut down; "
5032                                                         "automatic recovery in progress")));
5033                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5034                 }
5035                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5036                 ControlFile->checkPoint = checkPointLoc;
5037                 ControlFile->checkPointCopy = checkPoint;
                     /* an all-zero minRecoveryLoc is treated as "none supplied" */
5038                 if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
5039                         ControlFile->minRecoveryPoint = minRecoveryLoc;
5040                 ControlFile->time = (pg_time_t) time(NULL);
5041                 UpdateControlFile();
5042
5043                 /*
5044                  * If there was a backup label file, it's done its job and the info
5045                  * has now been propagated into pg_control.  We must get rid of the
5046                  * label file so that if we crash during recovery, we'll pick up at
5047                  * the latest recovery restartpoint instead of going all the way back
5048                  * to the backup start point.  It seems prudent though to just rename
5049                  * the file out of the way rather than delete it completely.
5050                  */
5051                 if (haveBackupLabel)
5052                 {
                             /* remove any earlier backup_label.old; unlink's result is not checked */
5053                         unlink(BACKUP_LABEL_OLD);
5054                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
5055                                 ereport(FATAL,
5056                                                 (errcode_for_file_access(),
5057                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5058                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
5059                 }
5060
5061                 /* Initialize resource managers */
5062                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5063                 {
5064                         if (RmgrTable[rmid].rm_startup != NULL)
5065                                 RmgrTable[rmid].rm_startup();
5066                 }
5067
5068                 /*
5069                  * Find the first record that logically follows the checkpoint --- it
5070                  * might physically precede it, though.
5071                  */
5072                 if (XLByteLT(checkPoint.redo, RecPtr))
5073                 {
5074                         /* back up to find the record */
5075                         record = ReadRecord(&(checkPoint.redo), PANIC);
5076                 }
5077                 else
5078                 {
5079                         /* just have to read next record after CheckPoint */
5080                         record = ReadRecord(NULL, LOG);
5081                 }
5082
5083                 if (record != NULL)
5084                 {
5085                         bool            recoveryContinue = true;
5086                         bool            recoveryApply = true;
5087                         ErrorContextCallback errcontext;
5088
5089                         InRedo = true;
5090                         ereport(LOG,
5091                                         (errmsg("redo starts at %X/%X",
5092                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5093
5094                         /*
5095                          * main redo apply loop
5096                          */
5097                         do
5098                         {
5099 #ifdef WAL_DEBUG
5100                                 if (XLOG_DEBUG)
5101                                 {
5102                                         StringInfoData buf;
5103
5104                                         initStringInfo(&buf);
5105                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
5106                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
5107                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
5108                                         xlog_outrec(&buf, record);
5109                                         appendStringInfo(&buf, " - ");
5110                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
5111                                                                                                            record->xl_info,
5112                                                                                                          XLogRecGetData(record));
5113                                         elog(LOG, "%s", buf.data);
5114                                         pfree(buf.data);
5115                                 }
5116 #endif
5117
5118                                 /*
5119                                  * Have we reached our recovery target?
5120                                  */
5121                                 if (recoveryStopsHere(record, &recoveryApply))
5122                                 {
5123                                         reachedStopPoint = true;        /* see below */
5124                                         recoveryContinue = false;
                                             /*
                                              * If this record is not to be applied, leave now;
                                              * otherwise fall through, apply it, and let the
                                              * loop condition (recoveryContinue) end the loop.
                                              */
5125                                         if (!recoveryApply)
5126                                                 break;
5127                                 }
5128
5129                                 /* Setup error traceback support for ereport() */
5130                                 errcontext.callback = rm_redo_error_callback;
5131                                 errcontext.arg = (void *) record;
5132                                 errcontext.previous = error_context_stack;
5133                                 error_context_stack = &errcontext;
5134
5135                                 /* nextXid must be beyond record's xid */
5136                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
5137                                                                                                  ShmemVariableCache->nextXid))
5138                                 {
5139                                         ShmemVariableCache->nextXid = record->xl_xid;
5140                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
5141                                 }
5142
                                     /* restore any backup blocks, then let the owning rmgr replay */
5143                                 if (record->xl_info & XLR_BKP_BLOCK_MASK)
5144                                         RestoreBkpBlocks(record, EndRecPtr);
5145
5146                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
5147
5148                                 /* Pop the error context stack */
5149                                 error_context_stack = errcontext.previous;
5150
                                     /* remember this record; it is re-read after the loop to find EndOfLog */
5151                                 LastRec = ReadRecPtr;
5152
5153                                 record = ReadRecord(NULL, LOG);
5154                         } while (record != NULL && recoveryContinue);
5155
5156                         /*
5157                          * end of main redo apply loop
5158                          */
5159
5160                         ereport(LOG,
5161                                         (errmsg("redo done at %X/%X",
5162                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
5163                         if (recoveryLastXTime)
5164                                 ereport(LOG,
5165                                          (errmsg("last completed transaction was at log time %s",
5166                                                          timestamptz_to_str(recoveryLastXTime))));
5167                         InRedo = false;
5168                 }
5169                 else
5170                 {
5171                         /* there are no WAL records following the checkpoint */
5172                         ereport(LOG,
5173                                         (errmsg("redo is not required")));
5174                 }
5175         }
5176
5177         /*
5178          * Re-fetch the last valid or last applied record, so we can identify the
5179          * exact endpoint of what we consider the valid portion of WAL.
              *
              * The returned record is not examined again in this function; only
              * EndRecPtr, read on the next line, is used.  endLogId/endLogSeg are
              * assigned nowhere else here, so XLByteToPrevSeg must set them.
5180          */
5181         record = ReadRecord(&LastRec, PANIC);
5182         EndOfLog = EndRecPtr;
5183         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
5184
5185         /*
5186          * Complain if we did not roll forward far enough to render the backup
5187          * dump consistent.
5188          */
5189         if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
5190         {
5191                 if (reachedStopPoint)   /* stopped because of stop request */
5192                         ereport(FATAL,
5193                                         (errmsg("requested recovery stop point is before end time of backup dump")));
5194                 else    /* ran off end of WAL */
5195                         ereport(FATAL,
5196                                         (errmsg("WAL ends before end time of backup dump")));
5197         }
5198
5199         /*
5200          * Consider whether we need to assign a new timeline ID.
5201          *
5202          * If we are doing an archive recovery, we always assign a new ID.      This
5203          * handles a couple of issues.  If we stopped short of the end of WAL
5204          * during recovery, then we are clearly generating a new timeline and must
5205          * assign it a unique new ID.  Even if we ran to the end, modifying the
5206          * current last segment is problematic because it may result in trying to
5207          * overwrite an already-archived copy of that segment, and we encourage
5208          * DBAs to make their archive_commands reject that.  We can dodge the
5209          * problem by making the new active segment have a new timeline ID.
5210          *
5211          * In a normal crash recovery, we can just extend the timeline we were in.
5212          */
5213         if (InArchiveRecovery)
5214         {
5215                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
5216                 ereport(LOG,
5217                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
5218                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
5219                                                          curFileTLI, endLogId, endLogSeg);
5220         }
5221
5222         /* Save the selected TimeLineID in shared memory, too */
5223         XLogCtl->ThisTimeLineID = ThisTimeLineID;
5224
5225         /*
5226          * We are now done reading the old WAL.  Turn off archive fetching if it
5227          * was active, and make a writable copy of the last WAL segment. (Note
5228          * that we also have a copy of the last block of the old WAL in readBuf;
5229          * we will use that below.)
5230          */
5231         if (InArchiveRecovery)
5232                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
5233
5234         /*
5235          * Prepare to write WAL starting at EndOfLog position, and init xlog
5236          * buffer cache using the block containing the last record from the
5237          * previous incarnation.
              *
              * xlblocks[0].xrecoff is EndOfLog.xrecoff rounded up to a multiple of
              * XLOG_BLCKSZ, i.e. the end of the block that contains the byte just
              * before EndOfLog.
5238          */
5239         openLogId = endLogId;
5240         openLogSeg = endLogSeg;
5241         openLogFile = XLogFileOpen(openLogId, openLogSeg);
5242         openLogOff = 0;
5243         Insert = &XLogCtl->Insert;
5244         Insert->PrevRecord = LastRec;
5245         XLogCtl->xlblocks[0].xlogid = openLogId;
5246         XLogCtl->xlblocks[0].xrecoff =
5247                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
5248
5249         /*
5250          * Tricky point here: readBuf contains the *last* block that the LastRec
5251          * record spans, not the one it starts in.      The last block is indeed the
5252          * one we want to use.
              *
              * currpos is then placed at EndOfLog's offset within that block
              * (EndOfLog.xrecoff minus the block's starting offset).
5253          */
5254         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
5255         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
5256         Insert->currpos = (char *) Insert->currpage +
5257                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
5258
             /* Everything up to EndOfLog counts as already written and flushed */
5259         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
5260
5261         XLogCtl->Write.LogwrtResult = LogwrtResult;
5262         Insert->LogwrtResult = LogwrtResult;
5263         XLogCtl->LogwrtResult = LogwrtResult;
5264
5265         XLogCtl->LogwrtRqst.Write = EndOfLog;
5266         XLogCtl->LogwrtRqst.Flush = EndOfLog;
5267
5268         freespace = INSERT_FREESPACE(Insert);
5269         if (freespace > 0)
5270         {
5271                 /* Make sure rest of page is zero */
5272                 MemSet(Insert->currpos, 0, freespace);
5273                 XLogCtl->Write.curridx = 0;
5274         }
5275         else
5276         {
5277                 /*
5278                  * Whenever Write.LogwrtResult points to exactly the end of a page,
5279                  * Write.curridx must point to the *next* page (see XLogWrite()).
5280                  *
5281                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
5282                  * this is sufficient.  The first actual attempt to insert a log
5283                  * record will advance the insert state.
5284                  */
5285                 XLogCtl->Write.curridx = NextBufIdx(0);
5286         }
5287
5288         /* Pre-scan prepared transactions to find out the range of XIDs present */
5289         oldestActiveXID = PrescanPreparedTransactions();
5290
5291         if (InRecovery)
5292         {
5293                 int                     rmid;
5294
5295                 /*
5296                  * Allow resource managers to do any required cleanup.
5297                  */
5298                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
5299                 {
5300                         if (RmgrTable[rmid].rm_cleanup != NULL)
5301                                 RmgrTable[rmid].rm_cleanup();
5302                 }
5303
5304                 /*
5305                  * Check to see if the XLOG sequence contained any unresolved
5306                  * references to uninitialized pages.
5307                  */
5308                 XLogCheckInvalidPages();
5309
5310                 /*
5311                  * Reset pgstat data, because it may be invalid after recovery.
5312                  */
5313                 pgstat_reset_all();
5314
5315                 /*
5316                  * Perform a checkpoint to update all our recovery activity to disk.
5317                  *
5318                  * Note that we write a shutdown checkpoint rather than an on-line
5319                  * one. This is not particularly critical, but since we may be
5320                  * assigning a new TLI, using a shutdown checkpoint allows us to have
5321                  * the rule that TLI only changes in shutdown checkpoints, which
5322                  * allows some extra error checking in xlog_redo.
5323                  */
5324                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5325         }
5326
5327         /*
5328          * Preallocate additional log files, if wanted.
5329          */
5330         PreallocXlogFiles(EndOfLog);
5331
5332         /*
5333          * Okay, we're officially UP.
5334          */
5335         InRecovery = false;
5336
5337         ControlFile->state = DB_IN_PRODUCTION;
5338         ControlFile->time = (pg_time_t) time(NULL);
5339         UpdateControlFile();
5340
5341         /* start the archive_timeout timer running */
5342         XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
5343
5344         /* initialize shared-memory copy of latest checkpoint XID/epoch */
5345         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5346         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
5347
5348         /* also initialize latestCompletedXid, to nextXid - 1 */
5349         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
5350         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
5351
5352         /* Start up the commit log and related stuff, too */
5353         StartupCLOG();
5354         StartupSUBTRANS(oldestActiveXID);
5355         StartupMultiXact();
5356
5357         /* Reload shared-memory state for prepared transactions */
5358         RecoverPreparedTransactions();
5359
5360         /* Shut down readFile facility, free space */
5361         if (readFile >= 0)
5362         {
5363                 close(readFile);
5364                 readFile = -1;
5365         }
5366         if (readBuf)
5367         {
5368                 free(readBuf);
5369                 readBuf = NULL;
5370         }
5371         if (readRecordBuf)
5372         {
5373                 free(readRecordBuf);
5374                 readRecordBuf = NULL;
5375                 readRecordBufSize = 0;
5376         }
5377 }
5378
5379 /*
5380  * Subroutine to try to fetch and validate a prior checkpoint record.
5381  *
5382  * whichChkpt identifies the checkpoint (merely for reporting purposes).
5383  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
5384  */
5385 static XLogRecord *
5386 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
5387 {
5388         XLogRecord *record;
5389
5390         if (!XRecOffIsValid(RecPtr.xrecoff))
5391         {
5392                 switch (whichChkpt)
5393                 {
5394                         case 1:
5395                                 ereport(LOG,
5396                                 (errmsg("invalid primary checkpoint link in control file")));
5397                                 break;
5398                         case 2:
5399                                 ereport(LOG,
5400                                                 (errmsg("invalid secondary checkpoint link in control file")));
5401                                 break;
5402                         default:
5403                                 ereport(LOG,
5404                                    (errmsg("invalid checkpoint link in backup_label file")));
5405                                 break;
5406                 }
5407                 return NULL;
5408         }
5409
5410         record = ReadRecord(&RecPtr, LOG);
5411
5412         if (record == NULL)
5413         {
5414                 switch (whichChkpt)
5415                 {
5416                         case 1:
5417                                 ereport(LOG,
5418                                                 (errmsg("invalid primary checkpoint record")));
5419                                 break;
5420                         case 2:
5421                                 ereport(LOG,
5422                                                 (errmsg("invalid secondary checkpoint record")));
5423                                 break;
5424                         default:
5425                                 ereport(LOG,
5426                                                 (errmsg("invalid checkpoint record")));
5427                                 break;
5428                 }
5429                 return NULL;
5430         }
5431         if (record->xl_rmid != RM_XLOG_ID)
5432         {
5433                 switch (whichChkpt)
5434                 {
5435                         case 1:
5436                                 ereport(LOG,
5437                                                 (errmsg("invalid resource manager ID in primary checkpoint record")));
5438                                 break;
5439                         case 2:
5440                                 ereport(LOG,
5441                                                 (errmsg("invalid resource manager ID in secondary checkpoint record")));
5442                                 break;
5443                         default:
5444                                 ereport(LOG,
5445                                 (errmsg("invalid resource manager ID in checkpoint record")));
5446                                 break;
5447                 }
5448                 return NULL;
5449         }
5450         if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
5451                 record->xl_info != XLOG_CHECKPOINT_ONLINE)
5452         {
5453                 switch (whichChkpt)
5454                 {
5455                         case 1:
5456                                 ereport(LOG,
5457                                    (errmsg("invalid xl_info in primary checkpoint record")));
5458                                 break;
5459                         case 2:
5460                                 ereport(LOG,
5461                                  (errmsg("invalid xl_info in secondary checkpoint record")));
5462                                 break;
5463                         default:
5464                                 ereport(LOG,
5465                                                 (errmsg("invalid xl_info in checkpoint record")));
5466                                 break;
5467                 }
5468                 return NULL;
5469         }
5470         if (record->xl_len != sizeof(CheckPoint) ||
5471                 record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
5472         {
5473                 switch (whichChkpt)
5474                 {
5475                         case 1:
5476                                 ereport(LOG,
5477                                         (errmsg("invalid length of primary checkpoint record")));
5478                                 break;
5479                         case 2:
5480                                 ereport(LOG,
5481                                   (errmsg("invalid length of secondary checkpoint record")));
5482                                 break;
5483                         default:
5484                                 ereport(LOG,
5485                                                 (errmsg("invalid length of checkpoint record")));
5486                                 break;
5487                 }
5488                 return NULL;
5489         }
5490         return record;
5491 }
5492
5493 /*
5494  * This must be called during startup of a backend process, except that
5495  * it need not be called in a standalone backend (which does StartupXLOG
5496  * instead).  We need to initialize the local copies of ThisTimeLineID and
5497  * RedoRecPtr.
5498  *
5499  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
5500  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
5501  * unnecessary however, since the postmaster itself never touches XLOG anyway.
5502  */
5503 void
5504 InitXLOGAccess(void)
5505 {
5506         /* ThisTimeLineID doesn't change so we need no lock to copy it */
5507         ThisTimeLineID = XLogCtl->ThisTimeLineID;
5508         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
5509         (void) GetRedoRecPtr();
5510 }
5511
5512 /*
5513  * Once spawned, a backend may update its local RedoRecPtr from
5514  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
5515  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
5516  */
5517 XLogRecPtr
5518 GetRedoRecPtr(void)
5519 {
5520         /* use volatile pointer to prevent code rearrangement */
5521         volatile XLogCtlData *xlogctl = XLogCtl;
5522
5523         SpinLockAcquire(&xlogctl->info_lck);
5524         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
5525         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
5526         SpinLockRelease(&xlogctl->info_lck);
5527
5528         return RedoRecPtr;
5529 }
5530
5531 /*
5532  * GetInsertRecPtr -- Returns the current insert position.
5533  *
5534  * NOTE: The value *actually* returned is the position of the last full
5535  * xlog page. It lags behind the real insert position by at most 1 page.
5536  * For that, we don't need to acquire WALInsertLock which can be quite
5537  * heavily contended, and an approximation is enough for the current
5538  * usage of this function.
5539  */
5540 XLogRecPtr
5541 GetInsertRecPtr(void)
5542 {
5543         /* use volatile pointer to prevent code rearrangement */
5544         volatile XLogCtlData *xlogctl = XLogCtl;
5545         XLogRecPtr      recptr;
5546
5547         SpinLockAcquire(&xlogctl->info_lck);
5548         recptr = xlogctl->LogwrtRqst.Write;
5549         SpinLockRelease(&xlogctl->info_lck);
5550
5551         return recptr;
5552 }
5553
5554 /*
5555  * Get the time of the last xlog segment switch
5556  */
5557 pg_time_t
5558 GetLastSegSwitchTime(void)
5559 {
5560         pg_time_t       result;
5561
5562         /* Need WALWriteLock, but shared lock is sufficient */
5563         LWLockAcquire(WALWriteLock, LW_SHARED);
5564         result = XLogCtl->Write.lastSegSwitchTime;
5565         LWLockRelease(WALWriteLock);
5566
5567         return result;
5568 }
5569
5570 /*
5571  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
5572  *
5573  * This is exported for use by code that would like to have 64-bit XIDs.
5574  * We don't really support such things, but all XIDs within the system
5575  * can be presumed "close to" the result, and thus the epoch associated
5576  * with them can be determined.
5577  */
5578 void
5579 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
5580 {
5581         uint32          ckptXidEpoch;
5582         TransactionId ckptXid;
5583         TransactionId nextXid;
5584
5585         /* Must read checkpoint info first, else have race condition */
5586         {
5587                 /* use volatile pointer to prevent code rearrangement */
5588                 volatile XLogCtlData *xlogctl = XLogCtl;
5589
5590                 SpinLockAcquire(&xlogctl->info_lck);
5591                 ckptXidEpoch = xlogctl->ckptXidEpoch;
5592                 ckptXid = xlogctl->ckptXid;
5593                 SpinLockRelease(&xlogctl->info_lck);
5594         }
5595
5596         /* Now fetch current nextXid */
5597         nextXid = ReadNewTransactionId();
5598
5599         /*
5600          * nextXid is certainly logically later than ckptXid.  So if it's
5601          * numerically less, it must have wrapped into the next epoch.
5602          */
5603         if (nextXid < ckptXid)
5604                 ckptXidEpoch++;
5605
5606         *xid = nextXid;
5607         *epoch = ckptXidEpoch;
5608 }
5609
5610 /*
5611  * This must be called ONCE during postmaster or standalone-backend shutdown
5612  */
5613 void
5614 ShutdownXLOG(int code, Datum arg)
5615 {
5616         ereport(LOG,
5617                         (errmsg("shutting down")));
5618
5619         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
5620         ShutdownCLOG();
5621         ShutdownSUBTRANS();
5622         ShutdownMultiXact();
5623
5624         ereport(LOG,
5625                         (errmsg("database system is shut down")));
5626 }
5627
5628 /*
5629  * Log start of a checkpoint.
5630  */
5631 static void
5632 LogCheckpointStart(int flags)
5633 {
5634         elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
5635                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
5636                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
5637                  (flags & CHECKPOINT_FORCE) ? " force" : "",
5638                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
5639                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
5640                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
5641 }
5642
5643 /*
5644  * Log end of a checkpoint.
5645  */
5646 static void
5647 LogCheckpointEnd(void)
5648 {
5649         long            write_secs,
5650                                 sync_secs,
5651                                 total_secs;
5652         int                     write_usecs,
5653                                 sync_usecs,
5654                                 total_usecs;
5655
5656         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
5657
5658         TimestampDifference(CheckpointStats.ckpt_start_t,
5659                                                 CheckpointStats.ckpt_end_t,
5660                                                 &total_secs, &total_usecs);
5661
5662         TimestampDifference(CheckpointStats.ckpt_write_t,
5663                                                 CheckpointStats.ckpt_sync_t,
5664                                                 &write_secs, &write_usecs);
5665
5666         TimestampDifference(CheckpointStats.ckpt_sync_t,
5667                                                 CheckpointStats.ckpt_sync_end_t,
5668                                                 &sync_secs, &sync_usecs);
5669
5670         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
5671                  "%d transaction log file(s) added, %d removed, %d recycled; "
5672                  "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
5673                  CheckpointStats.ckpt_bufs_written,
5674                  (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
5675                  CheckpointStats.ckpt_segs_added,
5676                  CheckpointStats.ckpt_segs_removed,
5677                  CheckpointStats.ckpt_segs_recycled,
5678                  write_secs, write_usecs / 1000,
5679                  sync_secs, sync_usecs / 1000,
5680                  total_secs, total_usecs / 1000);
5681 }
5682
5683 /*
5684  * Perform a checkpoint --- either during shutdown, or on-the-fly
5685  *
5686  * flags is a bitwise OR of the following:
5687  *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
5688  *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
5689  *              ignoring checkpoint_completion_target parameter.
5690  *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
5691  *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
5692  *
5693  * Note: flags contains other bits, of interest here only for logging purposes.
5694  * In particular note that this routine is synchronous and does not pay
5695  * attention to CHECKPOINT_WAIT.
5696  */
5697 void
5698 CreateCheckPoint(int flags)
5699 {
5700         bool            shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
5701         CheckPoint      checkPoint;
5702         XLogRecPtr      recptr;
5703         XLogCtlInsert *Insert = &XLogCtl->Insert;
5704         XLogRecData rdata;
5705         uint32          freespace;
5706         uint32          _logId;
5707         uint32          _logSeg;
5708         TransactionId *inCommitXids;
5709         int                     nInCommit;
5710
5711         /*
5712          * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
5713          * (This is just pro forma, since in the present system structure there is
5714          * only one process that is allowed to issue checkpoints at any given
5715          * time.)
5716          */
5717         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
5718
5719         /*
5720          * Prepare to accumulate statistics.
5721          *
5722          * Note: because it is possible for log_checkpoints to change while a
5723          * checkpoint proceeds, we always accumulate stats, even if
5724          * log_checkpoints is currently off.
5725          */
5726         MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
5727         CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
5728
5729         /*
5730          * Use a critical section to force system panic if we have trouble.
5731          */
5732         START_CRIT_SECTION();
5733
5734         if (shutdown)
5735         {
5736                 ControlFile->state = DB_SHUTDOWNING;
5737                 ControlFile->time = (pg_time_t) time(NULL);
5738                 UpdateControlFile();
5739         }
5740
5741         /*
5742          * Let smgr prepare for checkpoint; this has to happen before we determine
5743          * the REDO pointer.  Note that smgr must not do anything that'd have to
5744          * be undone if we decide no checkpoint is needed.
5745          */
5746         smgrpreckpt();
5747
5748         /* Begin filling in the checkpoint WAL record */
5749         MemSet(&checkPoint, 0, sizeof(checkPoint));
5750         checkPoint.ThisTimeLineID = ThisTimeLineID;
5751         checkPoint.time = (pg_time_t) time(NULL);
5752
5753         /*
5754          * We must hold WALInsertLock while examining insert state to determine
5755          * the checkpoint REDO pointer.
5756          */
5757         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
5758
5759         /*
5760          * If this isn't a shutdown or forced checkpoint, and we have not inserted
5761          * any XLOG records since the start of the last checkpoint, skip the
5762          * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
5763          * when the system is idle. That wastes log space, and more importantly it
5764          * exposes us to possible loss of both current and previous checkpoint
5765          * records if the machine crashes just as we're writing the update.
5766          * (Perhaps it'd make even more sense to checkpoint only when the previous
5767          * checkpoint record is in a different xlog page?)
5768          *
5769          * We have to make two tests to determine that nothing has happened since
5770          * the start of the last checkpoint: current insertion point must match
5771          * the end of the last checkpoint record, and its redo pointer must point
5772          * to itself.
5773          */
5774         if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
5775         {
5776                 XLogRecPtr      curInsert;
5777
5778                 INSERT_RECPTR(curInsert, Insert, Insert->curridx);
5779                 if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
5780                         curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
5781                         MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
5782                         ControlFile->checkPoint.xlogid ==
5783                         ControlFile->checkPointCopy.redo.xlogid &&
5784                         ControlFile->checkPoint.xrecoff ==
5785                         ControlFile->checkPointCopy.redo.xrecoff)
5786                 {
5787                         LWLockRelease(WALInsertLock);
5788                         LWLockRelease(CheckpointLock);
5789                         END_CRIT_SECTION();
5790                         return;
5791                 }
5792         }
5793
5794         /*
5795          * Compute new REDO record ptr = location of next XLOG record.
5796          *
5797          * NB: this is NOT necessarily where the checkpoint record itself will be,
5798          * since other backends may insert more XLOG records while we're off doing
5799          * the buffer flush work.  Those XLOG records are logically after the
5800          * checkpoint, even though physically before it.  Got that?
5801          */
5802         freespace = INSERT_FREESPACE(Insert);
5803         if (freespace < SizeOfXLogRecord)
5804         {
5805                 (void) AdvanceXLInsertBuffer(false);
5806                 /* OK to ignore update return flag, since we will do flush anyway */
5807                 freespace = INSERT_FREESPACE(Insert);
5808         }
5809         INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
5810
5811         /*
5812          * Here we update the shared RedoRecPtr for future XLogInsert calls; this
5813          * must be done while holding the insert lock AND the info_lck.
5814          *
5815          * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
5816          * pointing past where it really needs to point.  This is okay; the only
5817          * consequence is that XLogInsert might back up whole buffers that it
5818          * didn't really need to.  We can't postpone advancing RedoRecPtr because
5819          * XLogInserts that happen while we are dumping buffers must assume that
5820          * their buffer changes are not included in the checkpoint.
5821          */
5822         {
5823                 /* use volatile pointer to prevent code rearrangement */
5824                 volatile XLogCtlData *xlogctl = XLogCtl;
5825
5826                 SpinLockAcquire(&xlogctl->info_lck);
5827                 RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
5828                 SpinLockRelease(&xlogctl->info_lck);
5829         }
5830
5831         /*
5832          * Now we can release WAL insert lock, allowing other xacts to proceed
5833          * while we are flushing disk buffers.
5834          */
5835         LWLockRelease(WALInsertLock);
5836
5837         /*
5838          * If enabled, log checkpoint start.  We postpone this until now so as not
5839          * to log anything if we decided to skip the checkpoint.
5840          */
5841         if (log_checkpoints)
5842                 LogCheckpointStart(flags);
5843
5844         /*
5845          * Before flushing data, we must wait for any transactions that are
5846          * currently in their commit critical sections.  If an xact inserted its
5847          * commit record into XLOG just before the REDO point, then a crash
5848          * restart from the REDO point would not replay that record, which means
5849          * that our flushing had better include the xact's update of pg_clog.  So
5850          * we wait till he's out of his commit critical section before proceeding.
5851          * See notes in RecordTransactionCommit().
5852          *
5853          * Because we've already released WALInsertLock, this test is a bit fuzzy:
5854          * it is possible that we will wait for xacts we didn't really need to
5855          * wait for.  But the delay should be short and it seems better to make
5856          * checkpoint take a bit longer than to hold locks longer than necessary.
5857          * (In fact, the whole reason we have this issue is that xact.c does
5858          * commit record XLOG insertion and clog update as two separate steps
5859          * protected by different locks, but again that seems best on grounds of
5860          * minimizing lock contention.)
5861          *
5862          * A transaction that has not yet set inCommit when we look cannot be at
5863          * risk, since he's not inserted his commit record yet; and one that's
5864          * already cleared it is not at risk either, since he's done fixing clog
5865          * and we will correctly flush the update below.  So we cannot miss any
5866          * xacts we need to wait for.
5867          */
5868         nInCommit = GetTransactionsInCommit(&inCommitXids);
5869         if (nInCommit > 0)
5870         {
5871                 do
5872                 {
5873                         pg_usleep(10000L);      /* wait for 10 msec */
5874                 } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
5875         }
5876         pfree(inCommitXids);
5877
5878         /*
5879          * Get the other info we need for the checkpoint record.
5880          */
5881         LWLockAcquire(XidGenLock, LW_SHARED);
5882         checkPoint.nextXid = ShmemVariableCache->nextXid;
5883         LWLockRelease(XidGenLock);
5884
5885         /* Increase XID epoch if we've wrapped around since last checkpoint */
5886         checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
5887         if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
5888                 checkPoint.nextXidEpoch++;
5889
5890         LWLockAcquire(OidGenLock, LW_SHARED);
5891         checkPoint.nextOid = ShmemVariableCache->nextOid;
5892         if (!shutdown)
5893                 checkPoint.nextOid += ShmemVariableCache->oidCount;
5894         LWLockRelease(OidGenLock);
5895
5896         MultiXactGetCheckptMulti(shutdown,
5897                                                          &checkPoint.nextMulti,
5898                                                          &checkPoint.nextMultiOffset);
5899
5900         /*
5901          * Having constructed the checkpoint record, ensure all shmem disk buffers
5902          * and commit-log buffers are flushed to disk.
5903          *
5904          * This I/O could fail for various reasons.  If so, we will fail to
5905          * complete the checkpoint, but there is no reason to force a system
5906          * panic. Accordingly, exit critical section while doing it.
5907          */
5908         END_CRIT_SECTION();
5909
5910         CheckPointGuts(checkPoint.redo, flags);
5911
5912         START_CRIT_SECTION();
5913
5914         /*
5915          * Now insert the checkpoint record into XLOG.
5916          */
5917         rdata.data = (char *) (&checkPoint);
5918         rdata.len = sizeof(checkPoint);
5919         rdata.buffer = InvalidBuffer;
5920         rdata.next = NULL;
5921
5922         recptr = XLogInsert(RM_XLOG_ID,
5923                                                 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
5924                                                 XLOG_CHECKPOINT_ONLINE,
5925                                                 &rdata);
5926
5927         XLogFlush(recptr);
5928
5929         /*
5930          * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
5931          * = end of actual checkpoint record.
5932          */
5933         if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
5934                 ereport(PANIC,
5935                                 (errmsg("concurrent transaction log activity while database system is shutting down")));
5936
5937         /*
5938          * Select point at which we can truncate the log, which we base on the
5939          * prior checkpoint's earliest info.
5940          */
5941         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
5942
5943         /*
5944          * Update the control file.
5945          */
5946         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
5947         if (shutdown)
5948                 ControlFile->state = DB_SHUTDOWNED;
5949         ControlFile->prevCheckPoint = ControlFile->checkPoint;
5950         ControlFile->checkPoint = ProcLastRecPtr;
5951         ControlFile->checkPointCopy = checkPoint;
5952         ControlFile->time = (pg_time_t) time(NULL);
5953         UpdateControlFile();
5954         LWLockRelease(ControlFileLock);
5955
5956         /* Update shared-memory copy of checkpoint XID/epoch */
5957         {
5958                 /* use volatile pointer to prevent code rearrangement */
5959                 volatile XLogCtlData *xlogctl = XLogCtl;
5960
5961                 SpinLockAcquire(&xlogctl->info_lck);
5962                 xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
5963                 xlogctl->ckptXid = checkPoint.nextXid;
5964                 SpinLockRelease(&xlogctl->info_lck);
5965         }
5966
5967         /*
5968          * We are now done with critical updates; no need for system panic if we
5969          * have trouble while fooling with old log segments.
5970          */
5971         END_CRIT_SECTION();
5972
5973         /*
5974          * Let smgr do post-checkpoint cleanup (eg, deleting old files).
5975          */
5976         smgrpostckpt();
5977
5978         /*
5979          * Delete old log files (those no longer needed even for previous
5980          * checkpoint).
5981          */
5982         if (_logId || _logSeg)
5983         {
5984                 PrevLogSeg(_logId, _logSeg);
5985                 RemoveOldXlogFiles(_logId, _logSeg, recptr);
5986         }
5987
5988         /*
5989          * Make more log segments if needed.  (Do this after recycling old log
5990          * segments, since that may supply some of the needed files.)
5991          */
5992         if (!shutdown)
5993                 PreallocXlogFiles(recptr);
5994
5995         /*
5996          * Truncate pg_subtrans if possible.  We can throw away all data before
5997          * the oldest XMIN of any running transaction.  No future transaction will
5998          * attempt to reference any pg_subtrans entry older than that (see Asserts
5999          * in subtrans.c).      During recovery, though, we mustn't do this because
6000          * StartupSUBTRANS hasn't been called yet.
6001          */
6002         if (!InRecovery)
6003                 TruncateSUBTRANS(GetOldestXmin(true, false));
6004
6005         /* All real work is done, but log before releasing lock. */
6006         if (log_checkpoints)
6007                 LogCheckpointEnd();
6008
6009         LWLockRelease(CheckpointLock);
6010 }
6011
6012 /*
6013  * Flush all data in shared memory to disk, and fsync
6014  *
6015  * This is the common code shared between regular checkpoints and
6016  * recovery restartpoints.
6017  */
6018 static void
6019 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
6020 {
6021         CheckPointCLOG();
6022         CheckPointSUBTRANS();
6023         CheckPointMultiXact();
6024         CheckPointBuffers(flags);       /* performs all required fsyncs */
6025         /* We deliberately delay 2PC checkpointing as long as possible */
6026         CheckPointTwoPhase(checkPointRedo);
6027 }
6028
6029 /*
6030  * Set a recovery restart point if appropriate
6031  *
6032  * This is similar to CreateCheckPoint, but is used during WAL recovery
6033  * to establish a point from which recovery can roll forward without
6034  * replaying the entire recovery log.  This function is called each time
6035  * a checkpoint record is read from XLOG; it must determine whether a
6036  * restartpoint is needed or not.
6037  */
6038 static void
6039 RecoveryRestartPoint(const CheckPoint *checkPoint)
6040 {
6041         int                     elapsed_secs;
6042         int                     rmid;
6043
6044         /*
6045          * Do nothing if the elapsed time since the last restartpoint is less than
6046          * half of checkpoint_timeout.  (We use a value less than
6047          * checkpoint_timeout so that variations in the timing of checkpoints on
6048          * the master, or speed of transmission of WAL segments to a slave, won't
6049          * make the slave skip a restartpoint once it's synced with the master.)
6050          * Checking true elapsed time keeps us from doing restartpoints too often
6051          * while rapidly scanning large amounts of WAL.
6052          */
6053         elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
6054         if (elapsed_secs < CheckPointTimeout / 2)
6055                 return;
6056
6057         /*
6058          * Is it safe to checkpoint?  We must ask each of the resource managers
6059          * whether they have any partial state information that might prevent a
6060          * correct restart from this point.  If so, we skip this opportunity, but
6061          * return at the next checkpoint record for another try.
6062          */
6063         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6064         {
6065                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
6066                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
6067                         {
6068                                 elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
6069                                          rmid,
6070                                          checkPoint->redo.xlogid,
6071                                          checkPoint->redo.xrecoff);
6072                                 return;
6073                         }
6074         }
6075
6076         /*
6077          * OK, force data out to disk
6078          */
6079         CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
6080
6081         /*
6082          * Update pg_control so that any subsequent crash will restart from this
6083          * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
6084          * record itself.
6085          */
6086         ControlFile->prevCheckPoint = ControlFile->checkPoint;
6087         ControlFile->checkPoint = ReadRecPtr;
6088         ControlFile->checkPointCopy = *checkPoint;
6089         ControlFile->time = (pg_time_t) time(NULL);
6090         UpdateControlFile();
6091
6092         ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6093                         (errmsg("recovery restart point at %X/%X",
6094                                         checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
6095         if (recoveryLastXTime)
6096                 ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
6097                                 (errmsg("last completed transaction was at log time %s",
6098                                                 timestamptz_to_str(recoveryLastXTime))));
6099 }
6100
6101 /*
6102  * Write a NEXTOID log record
6103  */
6104 void
6105 XLogPutNextOid(Oid nextOid)
6106 {
6107         XLogRecData rdata;
6108
6109         rdata.data = (char *) (&nextOid);
6110         rdata.len = sizeof(Oid);
6111         rdata.buffer = InvalidBuffer;
6112         rdata.next = NULL;
6113         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
6114
6115         /*
6116          * We need not flush the NEXTOID record immediately, because any of the
6117          * just-allocated OIDs could only reach disk as part of a tuple insert or
6118          * update that would have its own XLOG record that must follow the NEXTOID
6119          * record.      Therefore, the standard buffer LSN interlock applied to those
6120          * records will ensure no such OID reaches disk before the NEXTOID record
6121          * does.
6122          *
6123          * Note, however, that the above statement only covers state "within" the
6124          * database.  When we use a generated OID as a file or directory name, we
6125          * are in a sense violating the basic WAL rule, because that filesystem
6126          * change may reach disk before the NEXTOID WAL record does.  The impact
6127          * of this is that if a database crash occurs immediately afterward, we
6128          * might after restart re-generate the same OID and find that it conflicts
6129          * with the leftover file or directory.  But since for safety's sake we
6130          * always loop until finding a nonconflicting filename, this poses no real
6131          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
6132          */
6133 }
6134
6135 /*
6136  * Write an XLOG SWITCH record.
6137  *
6138  * Here we just blindly issue an XLogInsert request for the record.
6139  * All the magic happens inside XLogInsert.
6140  *
6141  * The return value is either the end+1 address of the switch record,
6142  * or the end+1 address of the prior segment if we did not need to
6143  * write a switch record because we are already at segment start.
6144  */
6145 XLogRecPtr
6146 RequestXLogSwitch(void)
6147 {
6148         XLogRecPtr      RecPtr;
6149         XLogRecData rdata;
6150
6151         /* XLOG SWITCH, alone among xlog record types, has no data */
6152         rdata.buffer = InvalidBuffer;
6153         rdata.data = NULL;
6154         rdata.len = 0;
6155         rdata.next = NULL;
6156
6157         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
6158
6159         return RecPtr;
6160 }
6161
6162 /*
6163  * XLOG resource manager's routines
6164  */
6165 void
6166 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
6167 {
6168         uint8           info = record->xl_info & ~XLR_INFO_MASK;
6169
6170         if (info == XLOG_NEXTOID)
6171         {
6172                 Oid                     nextOid;
6173
6174                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
6175                 if (ShmemVariableCache->nextOid < nextOid)
6176                 {
6177                         ShmemVariableCache->nextOid = nextOid;
6178                         ShmemVariableCache->oidCount = 0;
6179                 }
6180         }
6181         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
6182         {
6183                 CheckPoint      checkPoint;
6184
6185                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6186                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
6187                 ShmemVariableCache->nextXid = checkPoint.nextXid;
6188                 ShmemVariableCache->nextOid = checkPoint.nextOid;
6189                 ShmemVariableCache->oidCount = 0;
6190                 MultiXactSetNextMXact(checkPoint.nextMulti,
6191                                                           checkPoint.nextMultiOffset);
6192
6193                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6194                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6195                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6196
6197                 /*
6198                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
6199                  */
6200                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6201                 {
6202                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
6203                                 !list_member_int(expectedTLIs,
6204                                                                  (int) checkPoint.ThisTimeLineID))
6205                                 ereport(PANIC,
6206                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
6207                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
6208                         /* Following WAL records should be run with new TLI */
6209                         ThisTimeLineID = checkPoint.ThisTimeLineID;
6210                 }
6211
6212                 RecoveryRestartPoint(&checkPoint);
6213         }
6214         else if (info == XLOG_CHECKPOINT_ONLINE)
6215         {
6216                 CheckPoint      checkPoint;
6217
6218                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
6219                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
6220                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
6221                                                                   checkPoint.nextXid))
6222                         ShmemVariableCache->nextXid = checkPoint.nextXid;
6223                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
6224                 {
6225                         ShmemVariableCache->nextOid = checkPoint.nextOid;
6226                         ShmemVariableCache->oidCount = 0;
6227                 }
6228                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
6229                                                                   checkPoint.nextMultiOffset);
6230
6231                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
6232                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
6233                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
6234
6235                 /* TLI should not change in an on-line checkpoint */
6236                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
6237                         ereport(PANIC,
6238                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
6239                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
6240
6241                 RecoveryRestartPoint(&checkPoint);
6242         }
6243         else if (info == XLOG_NOOP)
6244         {
6245                 /* nothing to do here */
6246         }
6247         else if (info == XLOG_SWITCH)
6248         {
6249                 /* nothing to do here */
6250         }
6251 }
6252
6253 void
6254 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
6255 {
6256         uint8           info = xl_info & ~XLR_INFO_MASK;
6257
6258         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
6259                 info == XLOG_CHECKPOINT_ONLINE)
6260         {
6261                 CheckPoint *checkpoint = (CheckPoint *) rec;
6262
6263                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
6264                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; %s",
6265                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
6266                                                  checkpoint->ThisTimeLineID,
6267                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
6268                                                  checkpoint->nextOid,
6269                                                  checkpoint->nextMulti,
6270                                                  checkpoint->nextMultiOffset,
6271                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
6272         }
6273         else if (info == XLOG_NOOP)
6274         {
6275                 appendStringInfo(buf, "xlog no-op");
6276         }
6277         else if (info == XLOG_NEXTOID)
6278         {
6279                 Oid                     nextOid;
6280
6281                 memcpy(&nextOid, rec, sizeof(Oid));
6282                 appendStringInfo(buf, "nextOid: %u", nextOid);
6283         }
6284         else if (info == XLOG_SWITCH)
6285         {
6286                 appendStringInfo(buf, "xlog switch");
6287         }
6288         else
6289                 appendStringInfo(buf, "UNKNOWN");
6290 }
6291
#ifdef WAL_DEBUG

/*
 * xlog_outrec: append a generic summary of a WAL record header to buf.
 *
 * The output holds the previous-record pointer, the xid, a "bkpbN" tag
 * (N counted from 1) for every XLR_SET_BKP_BLOCK bit set in xl_info, and
 * finally the name of the owning resource manager.
 */
static void
xlog_outrec(StringInfo buf, XLogRecord *record)
{
	int			blkno;

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 record->xl_prev.xlogid, record->xl_prev.xrecoff,
					 record->xl_xid);

	/* blkno is the 1-based number printed; the flag macro wants 0-based */
	for (blkno = 1; blkno <= XLR_MAX_BKP_BLOCKS; blkno++)
	{
		if ((record->xl_info & XLR_SET_BKP_BLOCK(blkno - 1)) == 0)
			continue;
		appendStringInfo(buf, "; bkpb%d", blkno);
	}

	appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
}
#endif   /* WAL_DEBUG */
6312
6313
6314 /*
6315  * Return the (possible) sync flag used for opening a file, depending on the
6316  * value of the GUC wal_sync_method.
6317  */
6318 static int
6319 get_sync_bit(int method)
6320 {
6321         /* If fsync is disabled, never open in sync mode */
6322         if (!enableFsync)
6323                 return 0;
6324
6325         switch (method)
6326         {
6327                 /*
6328                  * enum values for all sync options are defined even if they are not
6329                  * supported on the current platform.  But if not, they are not
6330                  * included in the enum option array, and therefore will never be seen
6331                  * here.
6332                  */
6333                 case SYNC_METHOD_FSYNC:
6334                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6335                 case SYNC_METHOD_FDATASYNC:
6336                         return 0;
6337 #ifdef OPEN_SYNC_FLAG
6338                 case SYNC_METHOD_OPEN:
6339                         return OPEN_SYNC_FLAG;
6340 #endif
6341 #ifdef OPEN_DATASYNC_FLAG
6342                 case SYNC_METHOD_OPEN_DSYNC:
6343                         return OPEN_DATASYNC_FLAG;
6344 #endif
6345                 default:
6346                         /* can't happen (unless we are out of sync with option array) */
6347                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
6348                         return 0; /* silence warning */
6349         }
6350 }
6351
6352 /*
6353  * GUC support
6354  */
6355 bool
6356 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
6357 {
6358         if (!doit)
6359                 return true;
6360
6361         if (sync_method != new_sync_method)
6362         {
6363                 /*
6364                  * To ensure that no blocks escape unsynced, force an fsync on the
6365                  * currently open log segment (if any).  Also, if the open flag is
6366                  * changing, close the log file so it will be reopened (with new flag
6367                  * bit) at next use.
6368                  */
6369                 if (openLogFile >= 0)
6370                 {
6371                         if (pg_fsync(openLogFile) != 0)
6372                                 ereport(PANIC,
6373                                                 (errcode_for_file_access(),
6374                                                  errmsg("could not fsync log file %u, segment %u: %m",
6375                                                                 openLogId, openLogSeg)));
6376                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
6377                                 XLogFileClose();
6378                 }
6379         }
6380
6381         return true;
6382 }
6383
6384
6385 /*
6386  * Issue appropriate kind of fsync (if any) on the current XLOG output file
6387  */
6388 static void
6389 issue_xlog_fsync(void)
6390 {
6391         switch (sync_method)
6392         {
6393                 case SYNC_METHOD_FSYNC:
6394                         if (pg_fsync_no_writethrough(openLogFile) != 0)
6395                                 ereport(PANIC,
6396                                                 (errcode_for_file_access(),
6397                                                  errmsg("could not fsync log file %u, segment %u: %m",
6398                                                                 openLogId, openLogSeg)));
6399                         break;
6400 #ifdef HAVE_FSYNC_WRITETHROUGH
6401                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
6402                         if (pg_fsync_writethrough(openLogFile) != 0)
6403                                 ereport(PANIC,
6404                                                 (errcode_for_file_access(),
6405                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
6406                                                                 openLogId, openLogSeg)));
6407                         break;
6408 #endif
6409 #ifdef HAVE_FDATASYNC
6410                 case SYNC_METHOD_FDATASYNC:
6411                         if (pg_fdatasync(openLogFile) != 0)
6412                                 ereport(PANIC,
6413                                                 (errcode_for_file_access(),
6414                                         errmsg("could not fdatasync log file %u, segment %u: %m",
6415                                                    openLogId, openLogSeg)));
6416                         break;
6417 #endif
6418                 case SYNC_METHOD_OPEN:
6419                 case SYNC_METHOD_OPEN_DSYNC:
6420                         /* write synced it already */
6421                         break;
6422                 default:
6423                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
6424                         break;
6425         }
6426 }
6427
6428
6429 /*
6430  * pg_start_backup: set up for taking an on-line backup dump
6431  *
6432  * Essentially what this does is to create a backup label file in $PGDATA,
6433  * where it will be archived as part of the backup dump.  The label file
6434  * contains the user-supplied label string (typically this would be used
6435  * to tell where the backup dump will be stored) and the starting time and
6436  * starting WAL location for the dump.
6437  */
6438 Datum
6439 pg_start_backup(PG_FUNCTION_ARGS)
6440 {
6441         text       *backupid = PG_GETARG_TEXT_P(0);
6442         char       *backupidstr;
6443         XLogRecPtr      checkpointloc;
6444         XLogRecPtr      startpoint;
6445         pg_time_t       stamp_time;
6446         char            strfbuf[128];
6447         char            xlogfilename[MAXFNAMELEN];
6448         uint32          _logId;
6449         uint32          _logSeg;
6450         struct stat stat_buf;
6451         FILE       *fp;
6452
6453         if (!superuser())
6454                 ereport(ERROR,
6455                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6456                                  errmsg("must be superuser to run a backup")));
6457
6458         if (!XLogArchivingActive())
6459                 ereport(ERROR,
6460                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6461                                  errmsg("WAL archiving is not active"),
6462                                  errhint("archive_mode must be enabled at server start.")));
6463
6464         if (!XLogArchiveCommandSet())
6465                 ereport(ERROR,
6466                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6467                                  errmsg("WAL archiving is not active"),
6468                                  errhint("archive_command must be defined before "
6469                                                  "online backups can be made safely.")));
6470
6471         backupidstr = text_to_cstring(backupid);
6472
6473         /*
6474          * Mark backup active in shared memory.  We must do full-page WAL writes
6475          * during an on-line backup even if not doing so at other times, because
6476          * it's quite possible for the backup dump to obtain a "torn" (partially
6477          * written) copy of a database page if it reads the page concurrently with
6478          * our write to the same page.  This can be fixed as long as the first
6479          * write to the page in the WAL sequence is a full-page write. Hence, we
6480          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
6481          * are no dirty pages in shared memory that might get dumped while the
6482          * backup is in progress without having a corresponding WAL record.  (Once
6483          * the backup is complete, we need not force full-page writes anymore,
6484          * since we expect that any pages not modified during the backup interval
6485          * must have been correctly captured by the backup.)
6486          *
6487          * We must hold WALInsertLock to change the value of forcePageWrites, to
6488          * ensure adequate interlocking against XLogInsert().
6489          */
6490         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6491         if (XLogCtl->Insert.forcePageWrites)
6492         {
6493                 LWLockRelease(WALInsertLock);
6494                 ereport(ERROR,
6495                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6496                                  errmsg("a backup is already in progress"),
6497                                  errhint("Run pg_stop_backup() and try again.")));
6498         }
6499         XLogCtl->Insert.forcePageWrites = true;
6500         LWLockRelease(WALInsertLock);
6501
6502         /* Ensure we release forcePageWrites if fail below */
6503         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6504         {
6505                 /*
6506                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
6507                  * page problems, this guarantees that two successive backup runs will
6508                  * have different checkpoint positions and hence different history
6509                  * file names, even if nothing happened in between.
6510                  *
6511                  * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
6512                  */
6513                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
6514
6515                 /*
6516                  * Now we need to fetch the checkpoint record location, and also its
6517                  * REDO pointer.  The oldest point in WAL that would be needed to
6518                  * restore starting from the checkpoint is precisely the REDO pointer.
6519                  */
6520                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6521                 checkpointloc = ControlFile->checkPoint;
6522                 startpoint = ControlFile->checkPointCopy.redo;
6523                 LWLockRelease(ControlFileLock);
6524
6525                 XLByteToSeg(startpoint, _logId, _logSeg);
6526                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
6527
6528                 /* Use the log timezone here, not the session timezone */
6529                 stamp_time = (pg_time_t) time(NULL);
6530                 pg_strftime(strfbuf, sizeof(strfbuf),
6531                                         "%Y-%m-%d %H:%M:%S %Z",
6532                                         pg_localtime(&stamp_time, log_timezone));
6533
6534                 /*
6535                  * Check for existing backup label --- implies a backup is already
6536                  * running.  (XXX given that we checked forcePageWrites above, maybe
6537                  * it would be OK to just unlink any such label file?)
6538                  */
6539                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
6540                 {
6541                         if (errno != ENOENT)
6542                                 ereport(ERROR,
6543                                                 (errcode_for_file_access(),
6544                                                  errmsg("could not stat file \"%s\": %m",
6545                                                                 BACKUP_LABEL_FILE)));
6546                 }
6547                 else
6548                         ereport(ERROR,
6549                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6550                                          errmsg("a backup is already in progress"),
6551                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
6552                                                          BACKUP_LABEL_FILE)));
6553
6554                 /*
6555                  * Okay, write the file
6556                  */
6557                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
6558                 if (!fp)
6559                         ereport(ERROR,
6560                                         (errcode_for_file_access(),
6561                                          errmsg("could not create file \"%s\": %m",
6562                                                         BACKUP_LABEL_FILE)));
6563                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6564                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
6565                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
6566                                 checkpointloc.xlogid, checkpointloc.xrecoff);
6567                 fprintf(fp, "START TIME: %s\n", strfbuf);
6568                 fprintf(fp, "LABEL: %s\n", backupidstr);
6569                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
6570                         ereport(ERROR,
6571                                         (errcode_for_file_access(),
6572                                          errmsg("could not write file \"%s\": %m",
6573                                                         BACKUP_LABEL_FILE)));
6574         }
6575         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
6576
6577         /*
6578          * We're done.  As a convenience, return the starting WAL location.
6579          */
6580         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
6581                          startpoint.xlogid, startpoint.xrecoff);
6582         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
6583 }
6584
6585 /* Error cleanup callback for pg_start_backup */
6586 static void
6587 pg_start_backup_callback(int code, Datum arg)
6588 {
6589         /* Turn off forcePageWrites on failure */
6590         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6591         XLogCtl->Insert.forcePageWrites = false;
6592         LWLockRelease(WALInsertLock);
6593 }
6594
6595 /*
6596  * pg_stop_backup: finish taking an on-line backup dump
6597  *
6598  * We remove the backup label file created by pg_start_backup, and instead
6599  * create a backup history file in pg_xlog (whence it will immediately be
6600  * archived).  The backup history file contains the same info found in
6601  * the label file, plus the backup-end time and WAL location.
6602  * Note: different from CancelBackup which just cancels online backup mode.
6603  */
6604 Datum
6605 pg_stop_backup(PG_FUNCTION_ARGS)
6606 {
6607         XLogRecPtr      startpoint;
6608         XLogRecPtr      stoppoint;
6609         pg_time_t       stamp_time;
6610         char            strfbuf[128];
6611         char            histfilepath[MAXPGPATH];
6612         char            startxlogfilename[MAXFNAMELEN];
6613         char            stopxlogfilename[MAXFNAMELEN];
6614         uint32          _logId;
6615         uint32          _logSeg;
6616         FILE       *lfp;
6617         FILE       *fp;
6618         char            ch;
6619         int                     ich;
6620         int                     seconds_before_warning;
6621         int                     waits = 0;
6622
6623         if (!superuser())
6624                 ereport(ERROR,
6625                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6626                                  (errmsg("must be superuser to run a backup"))));
6627
6628         if (!XLogArchivingActive())
6629                 ereport(ERROR,
6630                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6631                                  errmsg("WAL archiving is not active"),
6632                                  errhint("archive_mode must be enabled at server start.")));
6633
6634         /*
6635          * OK to clear forcePageWrites
6636          */
6637         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
6638         XLogCtl->Insert.forcePageWrites = false;
6639         LWLockRelease(WALInsertLock);
6640
6641         /*
6642          * Force a switch to a new xlog segment file, so that the backup is valid
6643          * as soon as archiver moves out the current segment file. We'll report
6644          * the end address of the XLOG SWITCH record as the backup stopping point.
6645          */
6646         stoppoint = RequestXLogSwitch();
6647
6648         XLByteToSeg(stoppoint, _logId, _logSeg);
6649         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
6650
6651         /* Use the log timezone here, not the session timezone */
6652         stamp_time = (pg_time_t) time(NULL);
6653         pg_strftime(strfbuf, sizeof(strfbuf),
6654                                 "%Y-%m-%d %H:%M:%S %Z",
6655                                 pg_localtime(&stamp_time, log_timezone));
6656
6657         /*
6658          * Open the existing label file
6659          */
6660         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
6661         if (!lfp)
6662         {
6663                 if (errno != ENOENT)
6664                         ereport(ERROR,
6665                                         (errcode_for_file_access(),
6666                                          errmsg("could not read file \"%s\": %m",
6667                                                         BACKUP_LABEL_FILE)));
6668                 ereport(ERROR,
6669                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6670                                  errmsg("a backup is not in progress")));
6671         }
6672
6673         /*
6674          * Read and parse the START WAL LOCATION line (this code is pretty crude,
6675          * but we are not expecting any variability in the file format).
6676          */
6677         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
6678                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
6679                            &ch) != 4 || ch != '\n')
6680                 ereport(ERROR,
6681                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6682                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
6683
6684         /*
6685          * Write the backup history file
6686          */
6687         XLByteToSeg(startpoint, _logId, _logSeg);
6688         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
6689                                                   startpoint.xrecoff % XLogSegSize);
6690         fp = AllocateFile(histfilepath, "w");
6691         if (!fp)
6692                 ereport(ERROR,
6693                                 (errcode_for_file_access(),
6694                                  errmsg("could not create file \"%s\": %m",
6695                                                 histfilepath)));
6696         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
6697                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
6698         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
6699                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
6700         /* transfer remaining lines from label to history file */
6701         while ((ich = fgetc(lfp)) != EOF)
6702                 fputc(ich, fp);
6703         fprintf(fp, "STOP TIME: %s\n", strfbuf);
6704         if (fflush(fp) || ferror(fp) || FreeFile(fp))
6705                 ereport(ERROR,
6706                                 (errcode_for_file_access(),
6707                                  errmsg("could not write file \"%s\": %m",
6708                                                 histfilepath)));
6709
6710         /*
6711          * Close and remove the backup label file
6712          */
6713         if (ferror(lfp) || FreeFile(lfp))
6714                 ereport(ERROR,
6715                                 (errcode_for_file_access(),
6716                                  errmsg("could not read file \"%s\": %m",
6717                                                 BACKUP_LABEL_FILE)));
6718         if (unlink(BACKUP_LABEL_FILE) != 0)
6719                 ereport(ERROR,
6720                                 (errcode_for_file_access(),
6721                                  errmsg("could not remove file \"%s\": %m",
6722                                                 BACKUP_LABEL_FILE)));
6723
6724         /*
6725          * Clean out any no-longer-needed history files.  As a side effect, this
6726          * will post a .ready file for the newly created history file, notifying
6727          * the archiver that history file may be archived immediately.
6728          */
6729         CleanupBackupHistory();
6730
6731         /*
6732          * Wait until both the last WAL file filled during backup and the history
6733          * file have been archived.  We assume that the alphabetic sorting
6734          * property of the WAL files ensures any earlier WAL files are safely
6735          * archived as well.
6736          *
6737          * We wait forever, since archive_command is supposed to work and
6738          * we assume the admin wanted his backup to work completely. If you
6739          * don't wish to wait, you can set statement_timeout.
6740          */
6741         BackupHistoryFileName(histfilepath, ThisTimeLineID, _logId, _logSeg,
6742                                                   startpoint.xrecoff % XLogSegSize);
6743
6744         seconds_before_warning = 60;
6745         waits = 0;
6746
6747         while (XLogArchiveIsBusy(stopxlogfilename) ||
6748                    XLogArchiveIsBusy(histfilepath))
6749         {
6750                 CHECK_FOR_INTERRUPTS();
6751
6752                 pg_usleep(1000000L);
6753
6754                 if (++waits >= seconds_before_warning)
6755                 {
6756                         seconds_before_warning *= 2;     /* This wraps in >10 years... */
6757                         ereport(WARNING,
6758                                         (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
6759                                                         waits)));
6760                 }
6761         }
6762
6763         /*
6764          * We're done.  As a convenience, return the ending WAL location.
6765          */
6766         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
6767                          stoppoint.xlogid, stoppoint.xrecoff);
6768         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
6769 }
6770
6771 /*
6772  * pg_switch_xlog: switch to next xlog file
6773  */
6774 Datum
6775 pg_switch_xlog(PG_FUNCTION_ARGS)
6776 {
6777         XLogRecPtr      switchpoint;
6778         char            location[MAXFNAMELEN];
6779
6780         if (!superuser())
6781                 ereport(ERROR,
6782                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
6783                          (errmsg("must be superuser to switch transaction log files"))));
6784
6785         switchpoint = RequestXLogSwitch();
6786
6787         /*
6788          * As a convenience, return the WAL location of the switch record
6789          */
6790         snprintf(location, sizeof(location), "%X/%X",
6791                          switchpoint.xlogid, switchpoint.xrecoff);
6792         PG_RETURN_TEXT_P(cstring_to_text(location));
6793 }
6794
6795 /*
6796  * Report the current WAL write location (same format as pg_start_backup etc)
6797  *
6798  * This is useful for determining how much of WAL is visible to an external
6799  * archiving process.  Note that the data before this point is written out
6800  * to the kernel, but is not necessarily synced to disk.
6801  */
6802 Datum
6803 pg_current_xlog_location(PG_FUNCTION_ARGS)
6804 {
6805         char            location[MAXFNAMELEN];
6806
6807         /* Make sure we have an up-to-date local LogwrtResult */
6808         {
6809                 /* use volatile pointer to prevent code rearrangement */
6810                 volatile XLogCtlData *xlogctl = XLogCtl;
6811
6812                 SpinLockAcquire(&xlogctl->info_lck);
6813                 LogwrtResult = xlogctl->LogwrtResult;
6814                 SpinLockRelease(&xlogctl->info_lck);
6815         }
6816
6817         snprintf(location, sizeof(location), "%X/%X",
6818                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
6819         PG_RETURN_TEXT_P(cstring_to_text(location));
6820 }
6821
6822 /*
6823  * Report the current WAL insert location (same format as pg_start_backup etc)
6824  *
6825  * This function is mostly for debugging purposes.
6826  */
6827 Datum
6828 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
6829 {
6830         XLogCtlInsert *Insert = &XLogCtl->Insert;
6831         XLogRecPtr      current_recptr;
6832         char            location[MAXFNAMELEN];
6833
6834         /*
6835          * Get the current end-of-WAL position ... shared lock is sufficient
6836          */
6837         LWLockAcquire(WALInsertLock, LW_SHARED);
6838         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
6839         LWLockRelease(WALInsertLock);
6840
6841         snprintf(location, sizeof(location), "%X/%X",
6842                          current_recptr.xlogid, current_recptr.xrecoff);
6843         PG_RETURN_TEXT_P(cstring_to_text(location));
6844 }
6845
6846 /*
6847  * Compute an xlog file name and decimal byte offset given a WAL location,
6848  * such as is returned by pg_stop_backup() or pg_xlog_switch().
6849  *
6850  * Note that a location exactly at a segment boundary is taken to be in
6851  * the previous segment.  This is usually the right thing, since the
6852  * expected usage is to determine which xlog file(s) are ready to archive.
6853  */
6854 Datum
6855 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
6856 {
6857         text       *location = PG_GETARG_TEXT_P(0);
6858         char       *locationstr;
6859         unsigned int uxlogid;
6860         unsigned int uxrecoff;
6861         uint32          xlogid;
6862         uint32          xlogseg;
6863         uint32          xrecoff;
6864         XLogRecPtr      locationpoint;
6865         char            xlogfilename[MAXFNAMELEN];
6866         Datum           values[2];
6867         bool            isnull[2];
6868         TupleDesc       resultTupleDesc;
6869         HeapTuple       resultHeapTuple;
6870         Datum           result;
6871
6872         /*
6873          * Read input and parse
6874          */
6875         locationstr = text_to_cstring(location);
6876
6877         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
6878                 ereport(ERROR,
6879                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6880                                  errmsg("could not parse transaction log location \"%s\"",
6881                                                 locationstr)));
6882
6883         locationpoint.xlogid = uxlogid;
6884         locationpoint.xrecoff = uxrecoff;
6885
6886         /*
6887          * Construct a tuple descriptor for the result row.  This must match this
6888          * function's pg_proc entry!
6889          */
6890         resultTupleDesc = CreateTemplateTupleDesc(2, false);
6891         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
6892                                            TEXTOID, -1, 0);
6893         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
6894                                            INT4OID, -1, 0);
6895
6896         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
6897
6898         /*
6899          * xlogfilename
6900          */
6901         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
6902         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
6903
6904         values[0] = CStringGetTextDatum(xlogfilename);
6905         isnull[0] = false;
6906
6907         /*
6908          * offset
6909          */
6910         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
6911
6912         values[1] = UInt32GetDatum(xrecoff);
6913         isnull[1] = false;
6914
6915         /*
6916          * Tuple jam: Having first prepared your Datums, then squash together
6917          */
6918         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
6919
6920         result = HeapTupleGetDatum(resultHeapTuple);
6921
6922         PG_RETURN_DATUM(result);
6923 }
6924
6925 /*
6926  * Compute an xlog file name given a WAL location,
6927  * such as is returned by pg_stop_backup() or pg_xlog_switch().
6928  */
6929 Datum
6930 pg_xlogfile_name(PG_FUNCTION_ARGS)
6931 {
6932         text       *location = PG_GETARG_TEXT_P(0);
6933         char       *locationstr;
6934         unsigned int uxlogid;
6935         unsigned int uxrecoff;
6936         uint32          xlogid;
6937         uint32          xlogseg;
6938         XLogRecPtr      locationpoint;
6939         char            xlogfilename[MAXFNAMELEN];
6940
6941         locationstr = text_to_cstring(location);
6942
6943         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
6944                 ereport(ERROR,
6945                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6946                                  errmsg("could not parse transaction log location \"%s\"",
6947                                                 locationstr)));
6948
6949         locationpoint.xlogid = uxlogid;
6950         locationpoint.xrecoff = uxrecoff;
6951
6952         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
6953         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
6954
6955         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
6956 }
6957
6958 /*
6959  * read_backup_label: check to see if a backup_label file is present
6960  *
6961  * If we see a backup_label during recovery, we assume that we are recovering
6962  * from a backup dump file, and we therefore roll forward from the checkpoint
6963  * identified by the label file, NOT what pg_control says.      This avoids the
6964  * problem that pg_control might have been archived one or more checkpoints
6965  * later than the start of the dump, and so if we rely on it as the start
6966  * point, we will fail to restore a consistent database state.
6967  *
6968  * We also attempt to retrieve the corresponding backup history file.
6969  * If successful, set *minRecoveryLoc to constrain valid PITR stopping
6970  * points.
6971  *
6972  * Returns TRUE if a backup_label was found (and fills the checkpoint
6973  * location into *checkPointLoc); returns FALSE if not.
6974  */
6975 static bool
6976 read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc)
6977 {
6978         XLogRecPtr      startpoint;
6979         XLogRecPtr      stoppoint;
6980         char            histfilename[MAXFNAMELEN];
6981         char            histfilepath[MAXPGPATH];
6982         char            startxlogfilename[MAXFNAMELEN];
6983         char            stopxlogfilename[MAXFNAMELEN];
6984         TimeLineID      tli;
6985         uint32          _logId;
6986         uint32          _logSeg;
6987         FILE       *lfp;
6988         FILE       *fp;
6989         char            ch;
6990
6991         /* Default is to not constrain recovery stop point */
6992         minRecoveryLoc->xlogid = 0;
6993         minRecoveryLoc->xrecoff = 0;
6994
6995         /*
6996          * See if label file is present
6997          */
6998         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
6999         if (!lfp)
7000         {
7001                 if (errno != ENOENT)
7002                         ereport(FATAL,
7003                                         (errcode_for_file_access(),
7004                                          errmsg("could not read file \"%s\": %m",
7005                                                         BACKUP_LABEL_FILE)));
7006                 return false;                   /* it's not there, all is fine */
7007         }
7008
7009         /*
7010          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
7011          * is pretty crude, but we are not expecting any variability in the file
7012          * format).
7013          */
7014         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
7015                            &startpoint.xlogid, &startpoint.xrecoff, &tli,
7016                            startxlogfilename, &ch) != 5 || ch != '\n')
7017                 ereport(FATAL,
7018                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7019                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7020         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
7021                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
7022                            &ch) != 3 || ch != '\n')
7023                 ereport(FATAL,
7024                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7025                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
7026         if (ferror(lfp) || FreeFile(lfp))
7027                 ereport(FATAL,
7028                                 (errcode_for_file_access(),
7029                                  errmsg("could not read file \"%s\": %m",
7030                                                 BACKUP_LABEL_FILE)));
7031
7032         /*
7033          * Try to retrieve the backup history file (no error if we can't)
7034          */
7035         XLByteToSeg(startpoint, _logId, _logSeg);
7036         BackupHistoryFileName(histfilename, tli, _logId, _logSeg,
7037                                                   startpoint.xrecoff % XLogSegSize);
7038
7039         if (InArchiveRecovery)
7040                 RestoreArchivedFile(histfilepath, histfilename, "RECOVERYHISTORY", 0);
7041         else
7042                 BackupHistoryFilePath(histfilepath, tli, _logId, _logSeg,
7043                                                           startpoint.xrecoff % XLogSegSize);
7044
7045         fp = AllocateFile(histfilepath, "r");
7046         if (fp)
7047         {
7048                 /*
7049                  * Parse history file to identify stop point.
7050                  */
7051                 if (fscanf(fp, "START WAL LOCATION: %X/%X (file %24s)%c",
7052                                    &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
7053                                    &ch) != 4 || ch != '\n')
7054                         ereport(FATAL,
7055                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7056                                          errmsg("invalid data in file \"%s\"", histfilename)));
7057                 if (fscanf(fp, "STOP WAL LOCATION: %X/%X (file %24s)%c",
7058                                    &stoppoint.xlogid, &stoppoint.xrecoff, stopxlogfilename,
7059                                    &ch) != 4 || ch != '\n')
7060                         ereport(FATAL,
7061                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
7062                                          errmsg("invalid data in file \"%s\"", histfilename)));
7063                 *minRecoveryLoc = stoppoint;
7064                 if (ferror(fp) || FreeFile(fp))
7065                         ereport(FATAL,
7066                                         (errcode_for_file_access(),
7067                                          errmsg("could not read file \"%s\": %m",
7068                                                         histfilepath)));
7069         }
7070
7071         return true;
7072 }
7073
7074 /*
7075  * Error context callback for errors occurring during rm_redo().
7076  */
7077 static void
7078 rm_redo_error_callback(void *arg)
7079 {
7080         XLogRecord *record = (XLogRecord *) arg;
7081         StringInfoData buf;
7082
7083         initStringInfo(&buf);
7084         RmgrTable[record->xl_rmid].rm_desc(&buf,
7085                                                                            record->xl_info,
7086                                                                            XLogRecGetData(record));
7087
7088         /* don't bother emitting empty description */
7089         if (buf.len > 0)
7090                 errcontext("xlog redo %s", buf.data);
7091
7092         pfree(buf.data);
7093 }
7094
7095 /*
7096  * BackupInProgress: check if online backup mode is active
7097  *
7098  * This is done by checking for existence of the "backup_label" file.
7099  */
7100 bool
7101 BackupInProgress(void)
7102 {
7103         struct stat stat_buf;
7104
7105         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
7106 }
7107
7108 /*
7109  * CancelBackup: rename the "backup_label" file to cancel backup mode
7110  *
7111  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
7112  * Note that this will render an online backup in progress useless.
7113  * To correctly finish an online backup, pg_stop_backup must be called.
7114  */
7115 void
7116 CancelBackup(void)
7117 {
7118         struct stat stat_buf;
7119
7120         /* if the file is not there, return */
7121         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
7122                 return;
7123
7124         /* remove leftover file from previously cancelled backup if it exists */
7125         unlink(BACKUP_LABEL_OLD);
7126
7127         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
7128         {
7129                 ereport(LOG,
7130                                 (errmsg("online backup mode cancelled"),
7131                                  errdetail("\"%s\" was renamed to \"%s\".",
7132                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7133         }
7134         else
7135         {
7136                 ereport(WARNING,
7137                                 (errcode_for_file_access(),
7138                                  errmsg("online backup mode was not cancelled"),
7139                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
7140                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
7141         }
7142 }
7143