src/bin/pg_rewind/pg_rewind.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * pg_rewind.c
   4  *        Synchronizes a PostgreSQL data directory to a new timeline
   5  *
   6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
   7  *
   8  *-------------------------------------------------------------------------
   9  */
  10 #include "postgres_fe.h"
  11
  12 #include <sys/stat.h>
  13 #include <fcntl.h>
  14 #include <time.h>
  15 #include <unistd.h>
  16
  17 #include "access/timeline.h"
  18 #include "access/xlog_internal.h"
  19 #include "catalog/catversion.h"
  20 #include "catalog/pg_control.h"
  21 #include "common/controldata_utils.h"
  22 #include "common/file_perm.h"
  23 #include "common/restricted_token.h"
  24 #include "common/string.h"
  25 #include "fe_utils/recovery_gen.h"
  26 #include "file_ops.h"
  27 #include "filemap.h"
  28 #include "getopt_long.h"
  29 #include "pg_rewind.h"
  30 #include "rewind_source.h"
  31 #include "storage/bufpage.h"
  32
/*
 * Forward declarations of the file-local helpers.  usage(),
 * perform_rewind() and sanityChecks() are defined further down in the
 * visible part of this file; the remaining helpers are declared here and
 * called from main()/perform_rewind().
 */
static void usage(const char *progname);

static void perform_rewind(filemap_t *filemap, rewind_source *source,
                                                   XLogRecPtr chkptrec,
                                                   TimeLineID chkpttli,
                                                   XLogRecPtr chkptredo);

static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
                                                          XLogRecPtr checkpointloc);

static void digestControlFile(ControlFileData *ControlFile,
                                                          const char *content, size_t size);
static void getRestoreCommand(const char *argv0);
static void sanityChecks(void);
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
static void ensureCleanShutdown(const char *argv0);
static void disconnect_atexit(void);

/*
 * Control file images, each filled in by digestControlFile() from the
 * contents of "global/pg_control":
 *
 * ControlFile_target		- read from the target data directory in main()
 * ControlFile_source		- fetched from the source in main(), before any
 *							  files are copied
 * ControlFile_source_after	- fetched from the source again in
 *							  perform_rewind(), after the copy has finished
 */
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
static ControlFileData ControlFile_source_after;

/* Program name, set in main() from get_progname(argv[0]) */
const char *progname;
/* NOTE(review): presumably the WAL segment size; it is not assigned in this part of the file */
int                     WalSegSz;

/* Configuration options */
char       *datadir_target = NULL;	/* -D / --target-pgdata */
char       *datadir_source = NULL;	/* --source-pgdata */
char       *connstr_source = NULL;	/* --source-server */
/*
 * Handed to the WAL-reading routines called from main().  It is not
 * assigned in this part of the file (presumably by getRestoreCommand() --
 * confirm).
 */
char       *restore_command = NULL;

static bool debug = false;		/* --debug; enables print_filemap() */
bool            showprogress = false;	/* -P / --progress */
bool            dry_run = false;	/* -n / --dry-run */
bool            do_sync = true;		/* cleared by -N / --no-sync */
bool            restore_wal = false;	/* -c / --restore-target-wal */

/* Target history */
TimeLineHistoryEntry *targetHistory;
int                     targetNentries;

/*
 * Progress counters: when progress reporting is enabled, main() sets
 * fetch_size from the file map and resets fetch_done to zero;
 * progress_report() reads both.
 */
uint64          fetch_size;
uint64          fetch_done;

/* libpq connection to the source server; only opened with --source-server */
static PGconn *conn;
/* Source of the rewind, created in main() from either conn or datadir_source */
static rewind_source *source;
  80
/*
 * Print the command-line help text to stdout.
 *
 * progname is interpolated into the description and synopsis lines.  Note
 * that the parameter shadows the file-level variable of the same name;
 * main() passes that variable in.
 *
 * Each message is passed through _() (presumably the message-translation
 * macro), so every literal below is kept as its own printf() call.
 */
static void
usage(const char *progname)
{
        printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
        printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
        printf(_("Options:\n"));
        printf(_("  -c, --restore-target-wal       use restore_command in target configuration to\n"
                         "                                 retrieve WAL files from archives\n"));
        printf(_("  -D, --target-pgdata=DIRECTORY  existing data directory to modify\n"));
        printf(_("      --source-pgdata=DIRECTORY  source data directory to synchronize with\n"));
        printf(_("      --source-server=CONNSTR    source server to synchronize with\n"));
        printf(_("  -n, --dry-run                  stop before modifying anything\n"));
        printf(_("  -N, --no-sync                  do not wait for changes to be written\n"
                         "                                 safely to disk\n"));
        printf(_("  -P, --progress                 write progress messages\n"));
        printf(_("  -R, --write-recovery-conf      write configuration for replication\n"
                         "                                 (requires --source-server)\n"));
        printf(_("      --debug                    write a lot of debug messages\n"));
        printf(_("      --no-ensure-shutdown       do not automatically fix unclean shutdown\n"));
        printf(_("  -V, --version                  output version information, then exit\n"));
        printf(_("  -?, --help                     show this help, then exit\n"));
        /* Trailer: where to report bugs and the project home page */
        printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
        printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
}
 105
 106
/*
 * Entry point of pg_rewind.
 *
 * Overall flow:
 *	1. Parse and validate the command line; refuse to run as root (non-Windows).
 *	2. Set the umask from the target data directory's permissions.
 *	3. Open the source, either over libpq (--source-server) or as a local
 *	   data directory (--source-pgdata).
 *	4. Read the target's control file, running ensureCleanShutdown() first
 *	   if it was not shut down cleanly (unless --no-ensure-shutdown), then
 *	   read the source's control file and run sanityChecks().
 *	5. Decide whether a rewind is needed.  If not, optionally write the
 *	   recovery configuration and exit(0).
 *	6. Otherwise build the file map from both directories and the target's
 *	   WAL, call perform_rewind(), sync the target, optionally write the
 *	   recovery configuration, and release the source.
 *
 * Returns 0 on success.  Command-line errors and the other explicit
 * failures in this function terminate with exit(1).
 */
int
main(int argc, char **argv)
{
        /*
         * Long options.  Options without a single-letter form use the small
         * integer codes 1-4, which the switch below dispatches on.
         */
        static struct option long_options[] = {
                {"help", no_argument, NULL, '?'},
                {"target-pgdata", required_argument, NULL, 'D'},
                {"write-recovery-conf", no_argument, NULL, 'R'},
                {"source-pgdata", required_argument, NULL, 1},
                {"source-server", required_argument, NULL, 2},
                {"no-ensure-shutdown", no_argument, NULL, 4},
                {"version", no_argument, NULL, 'V'},
                {"restore-target-wal", no_argument, NULL, 'c'},
                {"dry-run", no_argument, NULL, 'n'},
                {"no-sync", no_argument, NULL, 'N'},
                {"progress", no_argument, NULL, 'P'},
                {"debug", no_argument, NULL, 3},
                {NULL, 0, NULL, 0}
        };
        int                     option_index;
        int                     c;
        /* divergerec and lastcommontliIndex are only set when the timelines differ */
        XLogRecPtr      divergerec;
        int                     lastcommontliIndex;
        /* last common checkpoint, filled in by findLastCheckpoint() */
        XLogRecPtr      chkptrec;
        TimeLineID      chkpttli;
        XLogRecPtr      chkptredo;
        XLogRecPtr      target_wal_endrec;
        size_t          size;
        char       *buffer;
        bool            no_ensure_shutdown = false;
        bool            rewind_needed;
        bool            writerecoveryconf = false;
        filemap_t  *filemap;

        pg_logging_init(argv[0]);
        set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
        progname = get_progname(argv[0]);

        /* Process command-line arguments */

        /*
         * --help and --version are only recognized here when given as the very
         * first argument; they are answered before any other option is parsed.
         */
        if (argc > 1)
        {
                if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
                {
                        usage(progname);
                        exit(0);
                }
                if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
                {
                        puts("pg_rewind (PostgreSQL) " PG_VERSION);
                        exit(0);
                }
        }

        /*
         * NOTE(review): long_options maps "--version" to 'V', but the switch
         * below has neither a 'V' case nor a default, so a "--version" that is
         * not argv[1] is accepted and silently ignored.
         */
        while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
        {
                switch (c)
                {
                        case '?':
                                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                                exit(1);

                        case 'c':
                                restore_wal = true;
                                break;

                        case 'P':
                                showprogress = true;
                                break;

                        case 'n':
                                dry_run = true;
                                break;

                        case 'N':
                                do_sync = false;
                                break;

                        case 'R':
                                writerecoveryconf = true;
                                break;

                        case 3:                         /* --debug */
                                debug = true;
                                pg_logging_increase_verbosity();
                                break;

                        case 'D':                       /* -D or --target-pgdata */
                                datadir_target = pg_strdup(optarg);
                                break;

                        case 1:                         /* --source-pgdata */
                                datadir_source = pg_strdup(optarg);
                                break;

                        case 2:                         /* --source-server */
                                connstr_source = pg_strdup(optarg);
                                break;

                        case 4:                         /* --no-ensure-shutdown */
                                no_ensure_shutdown = true;
                                break;
                }
        }

        /* Exactly one kind of source must be given ... */
        if (datadir_source == NULL && connstr_source == NULL)
        {
                pg_log_error("no source specified (--source-pgdata or --source-server)");
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }

        if (datadir_source != NULL && connstr_source != NULL)
        {
                pg_log_error("only one of --source-pgdata or --source-server can be specified");
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }

        /* ... and a target is always required. */
        if (datadir_target == NULL)
        {
                pg_log_error("no target data directory specified (--target-pgdata)");
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }

        /* -R is only accepted together with --source-server */
        if (writerecoveryconf && connstr_source == NULL)
        {
                pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }

        /* No non-option arguments are accepted */
        if (optind < argc)
        {
                pg_log_error("too many command-line arguments (first is \"%s\")",
                                         argv[optind]);
                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                exit(1);
        }

        /*
         * Don't allow pg_rewind to be run as root, to avoid overwriting the
         * ownership of files in the data directory. We need only check for root
         * -- any other user won't have sufficient permissions to modify files in
         * the data directory.
         */
#ifndef WIN32
        if (geteuid() == 0)
        {
                pg_log_error("cannot be executed by \"root\"");
                fprintf(stderr, _("You must run %s as the PostgreSQL superuser.\n"),
                                progname);
                exit(1);
        }
#endif

        get_restricted_token();

        /* Set mask based on PGDATA permissions */
        if (!GetDataDirectoryCreatePerm(datadir_target))
        {
                pg_log_error("could not read permissions of directory \"%s\": %m",
                                         datadir_target);
                exit(1);
        }

        umask(pg_mode_mask);

        getRestoreCommand(argv[0]);

        /* Registered before the connection is opened, so every later exit path runs it */
        atexit(disconnect_atexit);

        /*
         * Ok, we have all the options and we're ready to start. First, connect to
         * remote server.
         */
        if (connstr_source)
        {
                conn = PQconnectdb(connstr_source);

                if (PQstatus(conn) == CONNECTION_BAD)
                        pg_fatal("%s", PQerrorMessage(conn));

                if (showprogress)
                        pg_log_info("connected to server");

                source = init_libpq_source(conn);
        }
        else
                source = init_local_source(datadir_source);

        /*
         * Check the status of the target instance.
         *
         * If the target instance was not cleanly shut down, start and stop the
         * target cluster once in single-user mode to enforce recovery to finish,
         * ensuring that the cluster can be used by pg_rewind.  Note that if
         * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
         * need to make sure by themselves that the target cluster is in a clean
         * state.
         */
        buffer = slurpFile(datadir_target, "global/pg_control", &size);
        digestControlFile(&ControlFile_target, buffer, size);
        pg_free(buffer);

        if (!no_ensure_shutdown &&
                ControlFile_target.state != DB_SHUTDOWNED &&
                ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
        {
                ensureCleanShutdown(argv[0]);

                /* Re-read the control file to pick up the state after ensureCleanShutdown() */
                buffer = slurpFile(datadir_target, "global/pg_control", &size);
                digestControlFile(&ControlFile_target, buffer, size);
                pg_free(buffer);
        }

        /* Now the source's control file, through whichever source type is in use */
        buffer = source->fetch_file(source, "global/pg_control", &size);
        digestControlFile(&ControlFile_source, buffer, size);
        pg_free(buffer);

        sanityChecks();

        /*
         * Find the common ancestor timeline between the clusters.
         *
         * If both clusters are already on the same timeline, there's nothing to
         * do.
         *
         * NOTE(review): the comparison uses the timeline stored in each control
         * file's checkpoint copy.  A source that changed timeline after its
         * last checkpoint would presumably still show the old timeline here --
         * confirm whether that case needs handling.
         */
        if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
                ControlFile_source.checkPointCopy.ThisTimeLineID)
        {
                pg_log_info("source and target cluster are on the same timeline");
                rewind_needed = false;
                /* never read: the !rewind_needed branch below exits first */
                target_wal_endrec = 0;
        }
        else
        {
                XLogRecPtr      chkptendrec;

                findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
                pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
                                        LSN_FORMAT_ARGS(divergerec),
                                        targetHistory[lastcommontliIndex].tli);

                /*
                 * Determine the end-of-WAL on the target.
                 *
                 * The WAL ends at the last shutdown checkpoint, or at
                 * minRecoveryPoint if it was a standby. (If we supported rewinding a
                 * server that was not shut down cleanly, we would need to replay
                 * until we reach the first invalid record, like crash recovery does.)
                 */

                /* read the checkpoint record on the target to see where it ends. */
                chkptendrec = readOneRecord(datadir_target,
                                                                        ControlFile_target.checkPoint,
                                                                        targetNentries - 1,
                                                                        restore_command);

                /* End of target WAL is the later of minRecoveryPoint and the checkpoint's end */
                if (ControlFile_target.minRecoveryPoint > chkptendrec)
                {
                        target_wal_endrec = ControlFile_target.minRecoveryPoint;
                }
                else
                {
                        target_wal_endrec = chkptendrec;
                }

                /*
                 * Check for the possibility that the target is in fact a direct
                 * ancestor of the source. In that case, there is no divergent history
                 * in the target that needs rewinding.
                 */
                if (target_wal_endrec > divergerec)
                {
                        rewind_needed = true;
                }
                else
                {
                        /* the last common checkpoint record must be part of target WAL */
                        Assert(target_wal_endrec == divergerec);

                        rewind_needed = false;
                }
        }

        if (!rewind_needed)
        {
                pg_log_info("no rewind required");
                /* -R is still honored when nothing needs rewinding, except in a dry run */
                if (writerecoveryconf && !dry_run)
                        WriteRecoveryConfig(conn, datadir_target,
                                                                GenerateRecoveryConfig(conn, NULL));
                exit(0);
        }

        findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
                                           &chkptrec, &chkpttli, &chkptredo, restore_command);
        pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
                                LSN_FORMAT_ARGS(chkptrec), chkpttli);

        /* Initialize the hash table to track the status of each file */
        filehash_init();

        /*
         * Collect information about all files in both data directories.
         */
        if (showprogress)
                pg_log_info("reading source file list");
        source->traverse_files(source, &process_source_file);

        if (showprogress)
                pg_log_info("reading target file list");
        traverse_datadir(datadir_target, &process_target_file);

        /*
         * Read the target WAL from last checkpoint before the point of fork, to
         * extract all the pages that were modified on the target cluster after
         * the fork.
         */
        if (showprogress)
                pg_log_info("reading WAL in target");
        extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
                                   target_wal_endrec, restore_command);

        /*
         * We have collected all information we need from both systems. Decide
         * what to do with each file.
         */
        filemap = decide_file_actions();
        /* Totals are only computed when they will be reported */
        if (showprogress)
                calculate_totals(filemap);

        /* this is too verbose even for verbose mode */
        if (debug)
                print_filemap(filemap);

        /*
         * Ok, we're ready to start copying things over.
         */
        if (showprogress)
        {
                pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
                                        (unsigned long) (filemap->fetch_size / (1024 * 1024)),
                                        (unsigned long) (filemap->total_size / (1024 * 1024)));

                /* Seed the counters that progress_report() prints */
                fetch_size = filemap->fetch_size;
                fetch_done = 0;
        }

        /*
         * We have now collected all the information we need from both systems,
         * and we are ready to start modifying the target directory.
         *
         * This is the point of no return. Once we start copying things, there is
         * no turning back!
         */
        perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);

        if (showprogress)
                pg_log_info("syncing target data directory");
        sync_target_dir();

        /* Also update the standby configuration, if requested. */
        if (writerecoveryconf && !dry_run)
                WriteRecoveryConfig(conn, datadir_target,
                                                        GenerateRecoveryConfig(conn, NULL));

        /* don't need the source connection anymore */
        source->destroy(source);
        if (conn)
        {
                PQfinish(conn);
                /* cleared after PQfinish() so the stale pointer is not left behind */
                conn = NULL;
        }

        pg_log_info("Done!");

        return 0;
}
 485
 486 /*
 487  * Perform the rewind.
 488  *
 489  * We have already collected all the information we need from the
 490  * target and the source.
 491  */
 492 static void
 493 perform_rewind(filemap_t *filemap, rewind_source *source,
 494                            XLogRecPtr chkptrec,
 495                            TimeLineID chkpttli,
 496                            XLogRecPtr chkptredo)
 497 {
 498         XLogRecPtr      endrec;
 499         TimeLineID      endtli;
 500         ControlFileData ControlFile_new;
 501         size_t          size;
 502         char       *buffer;
 503
 504         /*
 505          * Execute the actions in the file map, fetching data from the source
 506          * system as needed.
 507          */
 508         for (int i = 0; i < filemap->nentries; i++)
 509         {
 510                 file_entry_t *entry = filemap->entries[i];
 511
 512                 /*
 513                  * If this is a relation file, copy the modified blocks.
 514                  *
 515                  * This is in addition to any other changes.
 516                  */
 517                 if (entry->target_pages_to_overwrite.bitmapsize > 0)
 518                 {
 519                         datapagemap_iterator_t *iter;
 520                         BlockNumber blkno;
 521                         off_t           offset;
 522
 523                         iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
 524                         while (datapagemap_next(iter, &blkno))
 525                         {
 526                                 offset = blkno * BLCKSZ;
 527                                 source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
 528                         }
 529                         pg_free(iter);
 530                 }
 531
 532                 switch (entry->action)
 533                 {
 534                         case FILE_ACTION_NONE:
 535                                 /* nothing else to do */
 536                                 break;
 537
 538                         case FILE_ACTION_COPY:
 539                                 /* Truncate the old file out of the way, if any */
 540                                 open_target_file(entry->path, true);
 541                                 source->queue_fetch_range(source, entry->path,
 542                                                                                   0, entry->source_size);
 543                                 break;
 544
 545                         case FILE_ACTION_TRUNCATE:
 546                                 truncate_target_file(entry->path, entry->source_size);
 547                                 break;
 548
 549                         case FILE_ACTION_COPY_TAIL:
 550                                 source->queue_fetch_range(source, entry->path,
 551                                                                                   entry->target_size,
 552                                                                                   entry->source_size - entry->target_size);
 553                                 break;
 554
 555                         case FILE_ACTION_REMOVE:
 556                                 remove_target(entry);
 557                                 break;
 558
 559                         case FILE_ACTION_CREATE:
 560                                 create_target(entry);
 561                                 break;
 562
 563                         case FILE_ACTION_UNDECIDED:
 564                                 pg_fatal("no action decided for file \"%s\"", entry->path);
 565                                 break;
 566                 }
 567         }
 568
 569         /* Complete any remaining range-fetches that we queued up above. */
 570         source->finish_fetch(source);
 571
 572         close_target_file();
 573
 574         progress_report(true);
 575
 576         /*
 577          * Fetch the control file from the source last. This ensures that the
 578          * minRecoveryPoint is up-to-date.
 579          */
 580         buffer = source->fetch_file(source, "global/pg_control", &size);
 581         digestControlFile(&ControlFile_source_after, buffer, size);
 582         pg_free(buffer);
 583
 584         /*
 585          * Sanity check: If the source is a local system, the control file should
 586          * not have changed since we started.
 587          *
 588          * XXX: We assume it hasn't been modified, but actually, what could go
 589          * wrong? The logic handles a libpq source that's modified concurrently,
 590          * why not a local datadir?
 591          */
 592         if (datadir_source &&
 593                 memcmp(&ControlFile_source, &ControlFile_source_after,
 594                            sizeof(ControlFileData)) != 0)
 595         {
 596                 pg_fatal("source system was modified while pg_rewind was running");
 597         }
 598
 599         if (showprogress)
 600                 pg_log_info("creating backup label and updating control file");
 601
 602         /*
 603          * Create a backup label file, to tell the target where to begin the WAL
 604          * replay. Normally, from the last common checkpoint between the source
 605          * and the target. But if the source is a standby server, it's possible
 606          * that the last common checkpoint is *after* the standby's restartpoint.
 607          * That implies that the source server has applied the checkpoint record,
 608          * but hasn't performed a corresponding restartpoint yet. Make sure we
 609          * start at the restartpoint's redo point in that case.
 610          *
 611          * Use the old version of the source's control file for this. The server
 612          * might have finished the restartpoint after we started copying files,
 613          * but we must begin from the redo point at the time that started copying.
 614          */
 615         if (ControlFile_source.checkPointCopy.redo < chkptredo)
 616         {
 617                 chkptredo = ControlFile_source.checkPointCopy.redo;
 618                 chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
 619                 chkptrec = ControlFile_source.checkPoint;
 620         }
 621         createBackupLabel(chkptredo, chkpttli, chkptrec);
 622
 623         /*
 624          * Update control file of target, to tell the target how far it must
 625          * replay the WAL (minRecoveryPoint).
 626          */
 627         if (connstr_source)
 628         {
 629                 /*
 630                  * The source is a live server. Like in an online backup, it's
 631                  * important that we recover all the WAL that was generated while we
 632                  * were copying files.
 633                  */
 634                 if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
 635                 {
 636                         /*
 637                          * Source is a standby server. We must replay to its
 638                          * minRecoveryPoint.
 639                          */
 640                         endrec = ControlFile_source_after.minRecoveryPoint;
 641                         endtli = ControlFile_source_after.minRecoveryPointTLI;
 642                 }
 643                 else
 644                 {
 645                         /*
 646                          * Source is a production, non-standby, server. We must replay to
 647                          * the last WAL insert location.
 648                          */
 649                         if (ControlFile_source_after.state != DB_IN_PRODUCTION)
 650                                 pg_fatal("source system was in unexpected state at end of rewind");
 651
 652                         endrec = source->get_current_wal_insert_lsn(source);
 653                         endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
 654                 }
 655         }
 656         else
 657         {
 658                 /*
 659                  * Source is a local data directory. It should've shut down cleanly,
 660                  * and we must replay to the latest shutdown checkpoint.
 661                  */
 662                 endrec = ControlFile_source_after.checkPoint;
 663                 endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
 664         }
 665
 666         memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
 667         ControlFile_new.minRecoveryPoint = endrec;
 668         ControlFile_new.minRecoveryPointTLI = endtli;
 669         ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
 670         if (!dry_run)
 671                 update_controlfile(datadir_target, &ControlFile_new, do_sync);
 672 }
 673
 674 static void
 675 sanityChecks(void)
 676 {
 677         /* TODO Check that there's no backup_label in either cluster */
 678
 679         /* Check system_identifier match */
 680         if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
 681                 pg_fatal("source and target clusters are from different systems");
 682
 683         /* check version */
 684         if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
 685                 ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
 686                 ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
 687                 ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
 688         {
 689                 pg_fatal("clusters are not compatible with this version of pg_rewind");
 690         }
 691
 692         /*
 693          * Target cluster need to use checksums or hint bit wal-logging, this to
 694          * prevent from data corruption that could occur because of hint bits.
 695          */
 696         if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
 697                 !ControlFile_target.wal_log_hints)
 698         {
 699                 pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
 700         }
 701
 702         /*
 703          * Target cluster better not be running. This doesn't guard against
 704          * someone starting the cluster concurrently. Also, this is probably more
 705          * strict than necessary; it's OK if the target node was not shut down
 706          * cleanly, as long as it isn't running at the moment.
 707          */
 708         if (ControlFile_target.state != DB_SHUTDOWNED &&
 709                 ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
 710                 pg_fatal("target server must be shut down cleanly");
 711
 712         /*
 713          * When the source is a data directory, also require that the source
 714          * server is shut down. There isn't any very strong reason for this
 715          * limitation, but better safe than sorry.
 716          */
 717         if (datadir_source &&
 718                 ControlFile_source.state != DB_SHUTDOWNED &&
 719                 ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
 720                 pg_fatal("source data directory must be shut down cleanly");
 721 }
 722
 723 /*
 724  * Print a progress report based on the fetch_size and fetch_done variables.
 725  *
 726  * Progress report is written at maximum once per second, except that the
 727  * last progress report is always printed.
 728  *
 729  * If finished is set to true, this is the last progress report. The cursor
 730  * is moved to the next line.
 731  */
 732 void
 733 progress_report(bool finished)
 734 {
 735         static pg_time_t last_progress_report = 0;
 736         int                     percent;
 737         char            fetch_done_str[32];
 738         char            fetch_size_str[32];
 739         pg_time_t       now;
 740
 741         if (!showprogress)
 742                 return;
 743
 744         now = time(NULL);
 745         if (now == last_progress_report && !finished)
 746                 return;                                 /* Max once per second */
 747
 748         last_progress_report = now;
 749         percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
 750
 751         /*
 752          * Avoid overflowing past 100% or the full size. This may make the total
 753          * size number change as we approach the end of the backup (the estimate
 754          * will always be wrong if WAL is included), but that's better than having
 755          * the done column be bigger than the total.
 756          */
 757         if (percent > 100)
 758                 percent = 100;
 759         if (fetch_done > fetch_size)
 760                 fetch_size = fetch_done;
 761
 762         snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
 763                          fetch_done / 1024);
 764         snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
 765                          fetch_size / 1024);
 766
 767         fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
 768                         (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
 769                         percent);
 770
 771         /*
 772          * Stay on the same line if reporting to a terminal and we're not done
 773          * yet.
 774          */
 775         fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
 776 }
 777
 778 /*
 779  * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
 780  * infinity as src/include/access/timeline.h states. This routine should
 781  * be used only when comparing WAL locations related to history files.
 782  */
 783 static XLogRecPtr
 784 MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
 785 {
 786         if (XLogRecPtrIsInvalid(a))
 787                 return b;
 788         else if (XLogRecPtrIsInvalid(b))
 789                 return a;
 790         else
 791                 return Min(a, b);
 792 }
 793
 794 /*
 795  * Retrieve timeline history for given control file which should behold
 796  * either source or target.
 797  */
 798 static TimeLineHistoryEntry *
 799 getTimelineHistory(ControlFileData *controlFile, int *nentries)
 800 {
 801         TimeLineHistoryEntry *history;
 802         TimeLineID      tli;
 803
 804         tli = controlFile->checkPointCopy.ThisTimeLineID;
 805
 806         /*
 807          * Timeline 1 does not have a history file, so there is no need to check
 808          * and fake an entry with infinite start and end positions.
 809          */
 810         if (tli == 1)
 811         {
 812                 history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
 813                 history->tli = tli;
 814                 history->begin = history->end = InvalidXLogRecPtr;
 815                 *nentries = 1;
 816         }
 817         else
 818         {
 819                 char            path[MAXPGPATH];
 820                 char       *histfile;
 821
 822                 TLHistoryFilePath(path, tli);
 823
 824                 /* Get history file from appropriate source */
 825                 if (controlFile == &ControlFile_source)
 826                         histfile = source->fetch_file(source, path, NULL);
 827                 else if (controlFile == &ControlFile_target)
 828                         histfile = slurpFile(datadir_target, path, NULL);
 829                 else
 830                         pg_fatal("invalid control file");
 831
 832                 history = rewind_parseTimeLineHistory(histfile, tli, nentries);
 833                 pg_free(histfile);
 834         }
 835
 836         if (debug)
 837         {
 838                 int                     i;
 839
 840                 if (controlFile == &ControlFile_source)
 841                         pg_log_debug("Source timeline history:");
 842                 else if (controlFile == &ControlFile_target)
 843                         pg_log_debug("Target timeline history:");
 844                 else
 845                         Assert(false);
 846
 847                 /*
 848                  * Print the target timeline history.
 849                  */
 850                 for (i = 0; i < targetNentries; i++)
 851                 {
 852                         TimeLineHistoryEntry *entry;
 853
 854                         entry = &history[i];
 855                         pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
 856                                                  LSN_FORMAT_ARGS(entry->begin),
 857                                                  LSN_FORMAT_ARGS(entry->end));
 858                 }
 859         }
 860
 861         return history;
 862 }
 863
 864 /*
 865  * Determine the TLI of the last common timeline in the timeline history of the
 866  * two clusters. targetHistory is filled with target timeline history and
 867  * targetNentries is number of items in targetHistory. *tliIndex is set to the
 868  * index of last common timeline in targetHistory array, and *recptr is set to
 869  * the position where the timeline history diverged (ie. the first WAL record
 870  * that's not the same in both clusters).
 871  *
 872  * Control files of both clusters must be read into ControlFile_target/source
 873  * before calling this routine.
 874  */
 875 static void
 876 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
 877 {
 878         TimeLineHistoryEntry *sourceHistory;
 879         int                     sourceNentries;
 880         int                     i,
 881                                 n;
 882
 883         /* Retrieve timelines for both source and target */
 884         sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
 885         targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
 886
 887         /*
 888          * Trace the history forward, until we hit the timeline diverge. It may
 889          * still be possible that the source and target nodes used the same
 890          * timeline number in their history but with different start position
 891          * depending on the history files that each node has fetched in previous
 892          * recovery processes. Hence check the start position of the new timeline
 893          * as well and move down by one extra timeline entry if they do not match.
 894          */
 895         n = Min(sourceNentries, targetNentries);
 896         for (i = 0; i < n; i++)
 897         {
 898                 if (sourceHistory[i].tli != targetHistory[i].tli ||
 899                         sourceHistory[i].begin != targetHistory[i].begin)
 900                         break;
 901         }
 902
 903         if (i > 0)
 904         {
 905                 i--;
 906                 *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
 907                 *tliIndex = i;
 908
 909                 pg_free(sourceHistory);
 910                 return;
 911         }
 912         else
 913         {
 914                 pg_fatal("could not find common ancestor of the source and target cluster's timelines");
 915         }
 916 }
 917
 918
 919 /*
 920  * Create a backup_label file that forces recovery to begin at the last common
 921  * checkpoint.
 922  */
 923 static void
 924 createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
 925 {
 926         XLogSegNo       startsegno;
 927         time_t          stamp_time;
 928         char            strfbuf[128];
 929         char            xlogfilename[MAXFNAMELEN];
 930         struct tm  *tmp;
 931         char            buf[1000];
 932         int                     len;
 933
 934         XLByteToSeg(startpoint, startsegno, WalSegSz);
 935         XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
 936
 937         /*
 938          * Construct backup label file
 939          */
 940         stamp_time = time(NULL);
 941         tmp = localtime(&stamp_time);
 942         strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
 943
 944         len = snprintf(buf, sizeof(buf),
 945                                    "START WAL LOCATION: %X/%X (file %s)\n"
 946                                    "CHECKPOINT LOCATION: %X/%X\n"
 947                                    "BACKUP METHOD: pg_rewind\n"
 948                                    "BACKUP FROM: standby\n"
 949                                    "START TIME: %s\n",
 950         /* omit LABEL: line */
 951                                    LSN_FORMAT_ARGS(startpoint), xlogfilename,
 952                                    LSN_FORMAT_ARGS(checkpointloc),
 953                                    strfbuf);
 954         if (len >= sizeof(buf))
 955                 pg_fatal("backup label buffer too small");      /* shouldn't happen */
 956
 957         /* TODO: move old file out of the way, if any. */
 958         open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
 959         write_target_range(buf, 0, len);
 960         close_target_file();
 961 }
 962
 963 /*
 964  * Check CRC of control file
 965  */
 966 static void
 967 checkControlFile(ControlFileData *ControlFile)
 968 {
 969         pg_crc32c       crc;
 970
 971         /* Calculate CRC */
 972         INIT_CRC32C(crc);
 973         COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc));
 974         FIN_CRC32C(crc);
 975
 976         /* And simply compare it */
 977         if (!EQ_CRC32C(crc, ControlFile->crc))
 978                 pg_fatal("unexpected control file CRC");
 979 }
 980
 981 /*
 982  * Verify control file contents in the buffer 'content', and copy it to
 983  * *ControlFile.
 984  */
 985 static void
 986 digestControlFile(ControlFileData *ControlFile, const char *content,
 987                                   size_t size)
 988 {
 989         if (size != PG_CONTROL_FILE_SIZE)
 990                 pg_fatal("unexpected control file size %d, expected %d",
 991                                  (int) size, PG_CONTROL_FILE_SIZE);
 992
 993         memcpy(ControlFile, content, sizeof(ControlFileData));
 994
 995         /* set and validate WalSegSz */
 996         WalSegSz = ControlFile->xlog_seg_size;
 997
 998         if (!IsValidWalSegSize(WalSegSz))
 999                 pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
1000                                                   "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
1001                                                   WalSegSz),
1002                                  WalSegSz);
1003
1004         /* Additional checks on control file */
1005         checkControlFile(ControlFile);
1006 }
1007
1008 /*
1009  * Get value of GUC parameter restore_command from the target cluster.
1010  *
1011  * This uses a logic based on "postgres -C" to get the value from the
1012  * cluster.
1013  */
1014 static void
1015 getRestoreCommand(const char *argv0)
1016 {
1017         int                     rc;
1018         char            postgres_exec_path[MAXPGPATH],
1019                                 postgres_cmd[MAXPGPATH],
1020                                 cmd_output[MAXPGPATH];
1021
1022         if (!restore_wal)
1023                 return;
1024
1025         /* find postgres executable */
1026         rc = find_other_exec(argv0, "postgres",
1027                                                  PG_BACKEND_VERSIONSTR,
1028                                                  postgres_exec_path);
1029
1030         if (rc < 0)
1031         {
1032                 char            full_path[MAXPGPATH];
1033
1034                 if (find_my_exec(argv0, full_path) < 0)
1035                         strlcpy(full_path, progname, sizeof(full_path));
1036
1037                 if (rc == -1)
1038                         pg_log_error("The program \"%s\" is needed by %s but was not found in the\n"
1039                                                  "same directory as \"%s\".\n"
1040                                                  "Check your installation.",
1041                                                  "postgres", progname, full_path);
1042                 else
1043                         pg_log_error("The program \"%s\" was found by \"%s\"\n"
1044                                                  "but was not the same version as %s.\n"
1045                                                  "Check your installation.",
1046                                                  "postgres", full_path, progname);
1047                 exit(1);
1048         }
1049
1050         /*
1051          * Build a command able to retrieve the value of GUC parameter
1052          * restore_command, if set.
1053          */
1054         snprintf(postgres_cmd, sizeof(postgres_cmd),
1055                          "\"%s\" -D \"%s\" -C restore_command",
1056                          postgres_exec_path, datadir_target);
1057
1058         if (!pipe_read_line(postgres_cmd, cmd_output, sizeof(cmd_output)))
1059                 exit(1);
1060
1061         (void) pg_strip_crlf(cmd_output);
1062
1063         if (strcmp(cmd_output, "") == 0)
1064                 pg_fatal("restore_command is not set in the target cluster");
1065
1066         restore_command = pg_strdup(cmd_output);
1067
1068         pg_log_debug("using for rewind restore_command = \'%s\'",
1069                                  restore_command);
1070 }
1071
1072
1073 /*
1074  * Ensure clean shutdown of target instance by launching single-user mode
1075  * postgres to do crash recovery.
1076  */
1077 static void
1078 ensureCleanShutdown(const char *argv0)
1079 {
1080         int                     ret;
1081 #define MAXCMDLEN (2 * MAXPGPATH)
1082         char            exec_path[MAXPGPATH];
1083         char            cmd[MAXCMDLEN];
1084
1085         /* locate postgres binary */
1086         if ((ret = find_other_exec(argv0, "postgres",
1087                                                            PG_BACKEND_VERSIONSTR,
1088                                                            exec_path)) < 0)
1089         {
1090                 char            full_path[MAXPGPATH];
1091
1092                 if (find_my_exec(argv0, full_path) < 0)
1093                         strlcpy(full_path, progname, sizeof(full_path));
1094
1095                 if (ret == -1)
1096                         pg_fatal("The program \"%s\" is needed by %s but was not found in the\n"
1097                                          "same directory as \"%s\".\n"
1098                                          "Check your installation.",
1099                                          "postgres", progname, full_path);
1100                 else
1101                         pg_fatal("The program \"%s\" was found by \"%s\"\n"
1102                                          "but was not the same version as %s.\n"
1103                                          "Check your installation.",
1104                                          "postgres", full_path, progname);
1105         }
1106
1107         pg_log_info("executing \"%s\" for target server to complete crash recovery",
1108                                 exec_path);
1109
1110         /*
1111          * Skip processing if requested, but only after ensuring presence of
1112          * postgres.
1113          */
1114         if (dry_run)
1115                 return;
1116
1117         /*
1118          * Finally run postgres in single-user mode.  There is no need to use
1119          * fsync here.  This makes the recovery faster, and the target data folder
1120          * is synced at the end anyway.
1121          */
1122         snprintf(cmd, MAXCMDLEN, "\"%s\" --single -F -D \"%s\" template1 < \"%s\"",
1123                          exec_path, datadir_target, DEVNULL);
1124
1125         if (system(cmd) != 0)
1126         {
1127                 pg_log_error("postgres single-user mode in target cluster failed");
1128                 pg_fatal("Command was: %s", cmd);
1129         }
1130 }
1131
1132 static void
1133 disconnect_atexit(void)
1134 {
1135         if (conn != NULL)
1136                 PQfinish(conn);
1137 }