src/backend/access/transam/xlogarchive.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlogarchive.c
   4  *              Functions for archiving WAL files and restoring from the archive.
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * src/backend/access/transam/xlogarchive.c
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <sys/stat.h>
  18 #include <sys/wait.h>
  19 #include <signal.h>
  20 #include <unistd.h>
  21
  22 #include "access/xlog.h"
  23 #include "access/xlog_internal.h"
  24 #include "access/xlogarchive.h"
  25 #include "miscadmin.h"
  26 #include "pgstat.h"
  27 #include "postmaster/startup.h"
  28 #include "postmaster/pgarch.h"
  29 #include "replication/walsender.h"
  30 #include "storage/fd.h"
  31 #include "storage/ipc.h"
  32 #include "storage/lwlock.h"
  33
  34 /*
  35  * Attempt to retrieve the specified file from off-line archival storage.
  36  * If successful, fill "path" with its complete path (note that this will be
  37  * a temp file name that doesn't follow the normal naming convention), and
  38  * return true.
  39  *
  40  * If not successful, fill "path" with the name of the normal on-line file
  41  * (which may or may not actually exist, but we'll try to use it), and return
  42  * false.
  43  *
  44  * For fixed-size files, the caller may pass the expected size as an
  45  * additional crosscheck on successful recovery.  If the file size is not
  46  * known, set expectedSize = 0.
  47  *
  48  * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments
  49  * in the archive. This is used when fetching the initial checkpoint record,
  50  * when we are not yet sure how far back we need the WAL.
  51  */
  52 bool
  53 RestoreArchivedFile(char *path, const char *xlogfname,
  54                                         const char *recovername, off_t expectedSize,
  55                                         bool cleanupEnabled)
  56 {
  57         char            xlogpath[MAXPGPATH];
  58         char            lastRestartPointFname[MAXPGPATH];
  59         bool            ret;
  60         struct stat stat_buf;
  61         XLogSegNo       restartSegNo;
  62         XLogRecPtr      restartRedoPtr;
  63         TimeLineID      restartTli;
  64
  65         /*
  66          * Ignore restore_command when not in archive recovery (meaning we are in
  67          * crash recovery).
  68          */
  69         if (!ArchiveRecoveryRequested)
  70                 goto not_available;
  71
  72         /* In standby mode, restore_command might not be supplied */
  73         if (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)
  74                 goto not_available;
  75
  76         /*
  77          * When doing archive recovery, we always prefer an archived log file even
  78          * if a file of the same name exists in XLOGDIR.  The reason is that the
  79          * file in XLOGDIR could be an old, un-filled or partly-filled version
  80          * that was copied and restored as part of backing up $PGDATA.
  81          *
  82          * We could try to optimize this slightly by checking the local copy
  83          * lastchange timestamp against the archived copy, but we have no API to
  84          * do this, nor can we guarantee that the lastchange timestamp was
  85          * preserved correctly when we copied to archive. Our aim is robustness,
  86          * so we elect not to do this.
  87          *
  88          * If we cannot obtain the log file from the archive, however, we will try
  89          * to use the XLOGDIR file if it exists.  This is so that we can make use
  90          * of log segments that weren't yet transferred to the archive.
  91          *
  92          * Notice that we don't actually overwrite any files when we copy back
  93          * from archive because the restore_command may inadvertently restore
  94          * inappropriate xlogs, or they may be corrupt, so we may wish to fallback
  95          * to the segments remaining in current XLOGDIR later. The
  96          * copy-from-archive filename is always the same, ensuring that we don't
  97          * run out of disk space on long recoveries.
  98          */
  99         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
 100
 101         /*
 102          * Make sure there is no existing file named recovername.
 103          */
 104         if (stat(xlogpath, &stat_buf) != 0)
 105         {
 106                 if (errno != ENOENT)
 107                         ereport(FATAL,
 108                                         (errcode_for_file_access(),
 109                                          errmsg("could not stat file \"%s\": %m",
 110                                                         xlogpath)));
 111         }
 112         else
 113         {
 114                 if (unlink(xlogpath) != 0)
 115                         ereport(FATAL,
 116                                         (errcode_for_file_access(),
 117                                          errmsg("could not remove file \"%s\": %m",
 118                                                         xlogpath)));
 119         }
 120
 121         /*
 122          * Calculate the archive file cutoff point for use during log shipping
 123          * replication. All files earlier than this point can be deleted from the
 124          * archive, though there is no requirement to do so.
 125          *
 126          * If cleanup is not enabled, initialise this with the filename of
 127          * InvalidXLogRecPtr, which will prevent the deletion of any WAL files
 128          * from the archive because of the alphabetic sorting property of WAL
 129          * filenames.
 130          *
 131          * Once we have successfully located the redo pointer of the checkpoint
 132          * from which we start recovery we never request a file prior to the redo
 133          * pointer of the last restartpoint. When redo begins we know that we have
 134          * successfully located it, so there is no need for additional status
 135          * flags to signify the point when we can begin deleting WAL files from
 136          * the archive.
 137          */
 138         if (cleanupEnabled)
 139         {
 140                 GetOldestRestartPoint(&restartRedoPtr, &restartTli);
 141                 XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
 142                 XLogFileName(lastRestartPointFname, restartTli, restartSegNo,
 143                                          wal_segment_size);
 144                 /* we shouldn't need anything earlier than last restart point */
 145                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
 146         }
 147         else
 148                 XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size);
 149
 150         /*
 151          * Check signals before restore command and reset afterwards.
 152          */
 153         PreRestoreCommand();
 154
 155         /*
 156          * Copy xlog from archival storage to XLOGDIR
 157          */
 158         ret = shell_restore(xlogfname, xlogpath, lastRestartPointFname);
 159
 160         PostRestoreCommand();
 161
 162         if (ret)
 163         {
 164                 /*
 165                  * command apparently succeeded, but let's make sure the file is
 166                  * really there now and has the correct size.
 167                  */
 168                 if (stat(xlogpath, &stat_buf) == 0)
 169                 {
 170                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
 171                         {
 172                                 int                     elevel;
 173
 174                                 /*
 175                                  * If we find a partial file in standby mode, we assume it's
 176                                  * because it's just being copied to the archive, and keep
 177                                  * trying.
 178                                  *
 179                                  * Otherwise treat a wrong-sized file as FATAL to ensure the
 180                                  * DBA would notice it, but is that too strong? We could try
 181                                  * to plow ahead with a local copy of the file ... but the
 182                                  * problem is that there probably isn't one, and we'd
 183                                  * incorrectly conclude we've reached the end of WAL and we're
 184                                  * done recovering ...
 185                                  */
 186                                 if (StandbyMode && stat_buf.st_size < expectedSize)
 187                                         elevel = DEBUG1;
 188                                 else
 189                                         elevel = FATAL;
 190                                 ereport(elevel,
 191                                                 (errmsg("archive file \"%s\" has wrong size: %lld instead of %lld",
 192                                                                 xlogfname,
 193                                                                 (long long int) stat_buf.st_size,
 194                                                                 (long long int) expectedSize)));
 195                                 return false;
 196                         }
 197                         else
 198                         {
 199                                 ereport(LOG,
 200                                                 (errmsg("restored log file \"%s\" from archive",
 201                                                                 xlogfname)));
 202                                 strcpy(path, xlogpath);
 203                                 return true;
 204                         }
 205                 }
 206                 else
 207                 {
 208                         /* stat failed */
 209                         int                     elevel = (errno == ENOENT) ? LOG : FATAL;
 210
 211                         ereport(elevel,
 212                                         (errcode_for_file_access(),
 213                                          errmsg("could not stat file \"%s\": %m", xlogpath),
 214                                          errdetail("restore_command returned a zero exit status, but stat() failed.")));
 215                 }
 216         }
 217
 218 not_available:
 219
 220         /*
 221          * if an archived file is not available, there might still be a version of
 222          * this file in XLOGDIR, so return that as the filename to open.
 223          *
 224          * In many recovery scenarios we expect this to fail also, but if so that
 225          * just means we've reached the end of WAL.
 226          */
 227         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
 228         return false;
 229 }
 230
 231 /*
 232  * A file was restored from the archive under a temporary filename (path),
 233  * and now we want to keep it. Rename it under the permanent filename in
 234  * pg_wal (xlogfname), replacing any existing file with the same name.
 235  */
 236 void
 237 KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
 238 {
 239         char            xlogfpath[MAXPGPATH];
 240         bool            reload = false;
 241         struct stat statbuf;
 242
 243         snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname);
 244
 245         if (stat(xlogfpath, &statbuf) == 0)
 246         {
 247                 char            oldpath[MAXPGPATH];
 248
 249 #ifdef WIN32
 250                 static unsigned int deletedcounter = 1;
 251
 252                 /*
 253                  * On Windows, if another process (e.g a walsender process) holds the
 254                  * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the
 255                  * file will still show up in directory listing until the last handle
 256                  * is closed, and we cannot rename the new file in its place until
 257                  * that. To avoid that problem, rename the old file to a temporary
 258                  * name first. Use a counter to create a unique filename, because the
 259                  * same file might be restored from the archive multiple times, and a
 260                  * walsender could still be holding onto an old deleted version of it.
 261                  */
 262                 snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
 263                                  xlogfpath, deletedcounter++);
 264                 if (rename(xlogfpath, oldpath) != 0)
 265                 {
 266                         ereport(ERROR,
 267                                         (errcode_for_file_access(),
 268                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
 269                                                         xlogfpath, oldpath)));
 270                 }
 271 #else
 272                 /* same-size buffers, so this never truncates */
 273                 strlcpy(oldpath, xlogfpath, MAXPGPATH);
 274 #endif
 275                 if (unlink(oldpath) != 0)
 276                         ereport(FATAL,
 277                                         (errcode_for_file_access(),
 278                                          errmsg("could not remove file \"%s\": %m",
 279                                                         xlogfpath)));
 280                 reload = true;
 281         }
 282
 283         durable_rename(path, xlogfpath, ERROR);
 284
 285         /*
 286          * Create .done file forcibly to prevent the restored segment from being
 287          * archived again later.
 288          */
 289         if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS)
 290                 XLogArchiveForceDone(xlogfname);
 291         else
 292                 XLogArchiveNotify(xlogfname);
 293
 294         /*
 295          * If the existing file was replaced, since walsenders might have it open,
 296          * request them to reload a currently-open segment. This is only required
 297          * for WAL segments, walsenders don't hold other files open, but there's
 298          * no harm in doing this too often, and we don't know what kind of a file
 299          * we're dealing with here.
 300          */
 301         if (reload)
 302                 WalSndRqstFileReload();
 303
 304         /*
 305          * Signal walsender that new WAL has arrived. Again, this isn't necessary
 306          * if we restored something other than a WAL segment, but it does no harm
 307          * either.
 308          */
 309         WalSndWakeup();
 310 }
 311
 312 /*
 313  * XLogArchiveNotify
 314  *
 315  * Create an archive notification file
 316  *
 317  * The name of the notification file is the message that will be picked up
 318  * by the archiver, e.g. we write 0000000100000001000000C6.ready
 319  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
 320  * then when complete, rename it to 0000000100000001000000C6.done
 321  */
 322 void
 323 XLogArchiveNotify(const char *xlog)
 324 {
 325         char            archiveStatusPath[MAXPGPATH];
 326         FILE       *fd;
 327
 328         /* insert an otherwise empty file called <XLOG>.ready */
 329         StatusFilePath(archiveStatusPath, xlog, ".ready");
 330         fd = AllocateFile(archiveStatusPath, "w");
 331         if (fd == NULL)
 332         {
 333                 ereport(LOG,
 334                                 (errcode_for_file_access(),
 335                                  errmsg("could not create archive status file \"%s\": %m",
 336                                                 archiveStatusPath)));
 337                 return;
 338         }
 339         if (FreeFile(fd))
 340         {
 341                 ereport(LOG,
 342                                 (errcode_for_file_access(),
 343                                  errmsg("could not write archive status file \"%s\": %m",
 344                                                 archiveStatusPath)));
 345                 return;
 346         }
 347
 348         /*
 349          * Timeline history files are given the highest archival priority to lower
 350          * the chance that a promoted standby will choose a timeline that is
 351          * already in use.  However, the archiver ordinarily tries to gather
 352          * multiple files to archive from each scan of the archive_status
 353          * directory, which means that newly created timeline history files could
 354          * be left unarchived for a while.  To ensure that the archiver picks up
 355          * timeline history files as soon as possible, we force the archiver to
 356          * scan the archive_status directory the next time it looks for a file to
 357          * archive.
 358          */
 359         if (IsTLHistoryFileName(xlog))
 360                 PgArchForceDirScan();
 361
 362         /* Notify archiver that it's got something to do */
 363         if (IsUnderPostmaster)
 364                 PgArchWakeup();
 365 }
 366
 367 /*
 368  * Convenience routine to notify using segment number representation of filename
 369  */
 370 void
 371 XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli)
 372 {
 373         char            xlog[MAXFNAMELEN];
 374
 375         Assert(tli != 0);
 376
 377         XLogFileName(xlog, tli, segno, wal_segment_size);
 378         XLogArchiveNotify(xlog);
 379 }
 380
 381 /*
 382  * XLogArchiveForceDone
 383  *
 384  * Emit notification forcibly that an XLOG segment file has been successfully
 385  * archived, by creating <XLOG>.done regardless of whether <XLOG>.ready
 386  * exists or not.
 387  */
 388 void
 389 XLogArchiveForceDone(const char *xlog)
 390 {
 391         char            archiveReady[MAXPGPATH];
 392         char            archiveDone[MAXPGPATH];
 393         struct stat stat_buf;
 394         FILE       *fd;
 395
 396         /* Exit if already known done */
 397         StatusFilePath(archiveDone, xlog, ".done");
 398         if (stat(archiveDone, &stat_buf) == 0)
 399                 return;
 400
 401         /* If .ready exists, rename it to .done */
 402         StatusFilePath(archiveReady, xlog, ".ready");
 403         if (stat(archiveReady, &stat_buf) == 0)
 404         {
 405                 (void) durable_rename(archiveReady, archiveDone, WARNING);
 406                 return;
 407         }
 408
 409         /* insert an otherwise empty file called <XLOG>.done */
 410         fd = AllocateFile(archiveDone, "w");
 411         if (fd == NULL)
 412         {
 413                 ereport(LOG,
 414                                 (errcode_for_file_access(),
 415                                  errmsg("could not create archive status file \"%s\": %m",
 416                                                 archiveDone)));
 417                 return;
 418         }
 419         if (FreeFile(fd))
 420         {
 421                 ereport(LOG,
 422                                 (errcode_for_file_access(),
 423                                  errmsg("could not write archive status file \"%s\": %m",
 424                                                 archiveDone)));
 425                 return;
 426         }
 427 }
 428
 429 /*
 430  * XLogArchiveCheckDone
 431  *
 432  * This is called when we are ready to delete or recycle an old XLOG segment
 433  * file or backup history file.  If it is okay to delete it then return true.
 434  * If it is not time to delete it, make sure a .ready file exists, and return
 435  * false.
 436  *
 437  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
 438  * then return false; else create <XLOG>.ready and return false.
 439  *
 440  * The reason we do things this way is so that if the original attempt to
 441  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
 442  */
 443 bool
 444 XLogArchiveCheckDone(const char *xlog)
 445 {
 446         char            archiveStatusPath[MAXPGPATH];
 447         struct stat stat_buf;
 448
 449         /* The file is always deletable if archive_mode is "off". */
 450         if (!XLogArchivingActive())
 451                 return true;
 452
 453         /*
 454          * During archive recovery, the file is deletable if archive_mode is not
 455          * "always".
 456          */
 457         if (!XLogArchivingAlways() &&
 458                 GetRecoveryState() == RECOVERY_STATE_ARCHIVE)
 459                 return true;
 460
 461         /*
 462          * At this point of the logic, note that we are either a primary with
 463          * archive_mode set to "on" or "always", or a standby with archive_mode
 464          * set to "always".
 465          */
 466
 467         /* First check for .done --- this means archiver is done with it */
 468         StatusFilePath(archiveStatusPath, xlog, ".done");
 469         if (stat(archiveStatusPath, &stat_buf) == 0)
 470                 return true;
 471
 472         /* check for .ready --- this means archiver is still busy with it */
 473         StatusFilePath(archiveStatusPath, xlog, ".ready");
 474         if (stat(archiveStatusPath, &stat_buf) == 0)
 475                 return false;
 476
 477         /* Race condition --- maybe archiver just finished, so recheck */
 478         StatusFilePath(archiveStatusPath, xlog, ".done");
 479         if (stat(archiveStatusPath, &stat_buf) == 0)
 480                 return true;
 481
 482         /* Retry creation of the .ready file */
 483         XLogArchiveNotify(xlog);
 484         return false;
 485 }
 486
 487 /*
 488  * XLogArchiveIsBusy
 489  *
 490  * Check to see if an XLOG segment file is still unarchived.
 491  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
 492  * the first place we aren't chartered to recreate the .ready file, and
 493  * in the second place we should consider that if the file is already gone
 494  * then it's not busy.  (This check is needed to handle the race condition
 495  * that a checkpoint already deleted the no-longer-needed file.)
 496  */
 497 bool
 498 XLogArchiveIsBusy(const char *xlog)
 499 {
 500         char            archiveStatusPath[MAXPGPATH];
 501         struct stat stat_buf;
 502
 503         /* First check for .done --- this means archiver is done with it */
 504         StatusFilePath(archiveStatusPath, xlog, ".done");
 505         if (stat(archiveStatusPath, &stat_buf) == 0)
 506                 return false;
 507
 508         /* check for .ready --- this means archiver is still busy with it */
 509         StatusFilePath(archiveStatusPath, xlog, ".ready");
 510         if (stat(archiveStatusPath, &stat_buf) == 0)
 511                 return true;
 512
 513         /* Race condition --- maybe archiver just finished, so recheck */
 514         StatusFilePath(archiveStatusPath, xlog, ".done");
 515         if (stat(archiveStatusPath, &stat_buf) == 0)
 516                 return false;
 517
 518         /*
 519          * Check to see if the WAL file has been removed by checkpoint, which
 520          * implies it has already been archived, and explains why we can't see a
 521          * status file for it.
 522          */
 523         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
 524         if (stat(archiveStatusPath, &stat_buf) != 0 &&
 525                 errno == ENOENT)
 526                 return false;
 527
 528         return true;
 529 }
 530
 531 /*
 532  * XLogArchiveIsReadyOrDone
 533  *
 534  * Check to see if an XLOG segment file has a .ready or .done file.
 535  * This is similar to XLogArchiveIsBusy(), but returns true if the file
 536  * is already archived or is about to be archived.
 537  *
 538  * This is currently only used at recovery.  During normal operation this
 539  * would be racy: the file might get removed or marked with .ready as we're
 540  * checking it, or immediately after we return.
 541  */
 542 bool
 543 XLogArchiveIsReadyOrDone(const char *xlog)
 544 {
 545         char            archiveStatusPath[MAXPGPATH];
 546         struct stat stat_buf;
 547
 548         /* First check for .done --- this means archiver is done with it */
 549         StatusFilePath(archiveStatusPath, xlog, ".done");
 550         if (stat(archiveStatusPath, &stat_buf) == 0)
 551                 return true;
 552
 553         /* check for .ready --- this means archiver is still busy with it */
 554         StatusFilePath(archiveStatusPath, xlog, ".ready");
 555         if (stat(archiveStatusPath, &stat_buf) == 0)
 556                 return true;
 557
 558         /* Race condition --- maybe archiver just finished, so recheck */
 559         StatusFilePath(archiveStatusPath, xlog, ".done");
 560         if (stat(archiveStatusPath, &stat_buf) == 0)
 561                 return true;
 562
 563         return false;
 564 }
 565
 566 /*
 567  * XLogArchiveIsReady
 568  *
 569  * Check to see if an XLOG segment file has an archive notification (.ready)
 570  * file.
 571  */
 572 bool
 573 XLogArchiveIsReady(const char *xlog)
 574 {
 575         char            archiveStatusPath[MAXPGPATH];
 576         struct stat stat_buf;
 577
 578         StatusFilePath(archiveStatusPath, xlog, ".ready");
 579         if (stat(archiveStatusPath, &stat_buf) == 0)
 580                 return true;
 581
 582         return false;
 583 }
 584
 585 /*
 586  * XLogArchiveCleanup
 587  *
 588  * Cleanup archive notification file(s) for a particular xlog segment
 589  */
 590 void
 591 XLogArchiveCleanup(const char *xlog)
 592 {
 593         char            archiveStatusPath[MAXPGPATH];
 594
 595         /* Remove the .done file */
 596         StatusFilePath(archiveStatusPath, xlog, ".done");
 597         unlink(archiveStatusPath);
 598         /* should we complain about failure? */
 599
 600         /* Remove the .ready file if present --- normally it shouldn't be */
 601         StatusFilePath(archiveStatusPath, xlog, ".ready");
 602         unlink(archiveStatusPath);
 603         /* should we complain about failure? */
 604 }