Update copyright for 2022
[pgsql.git] / src / bin / pg_rewind / pg_rewind.c
blobefb82a403416588a2df47645da8a5c134df73806
1 /*-------------------------------------------------------------------------
3 * pg_rewind.c
4 * Synchronizes a PostgreSQL data directory to a new timeline
6 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
8 *-------------------------------------------------------------------------
9 */
10 #include "postgres_fe.h"
12 #include <sys/stat.h>
13 #include <fcntl.h>
14 #include <time.h>
15 #include <unistd.h>
17 #include "access/timeline.h"
18 #include "access/xlog_internal.h"
19 #include "catalog/catversion.h"
20 #include "catalog/pg_control.h"
21 #include "common/controldata_utils.h"
22 #include "common/file_perm.h"
23 #include "common/restricted_token.h"
24 #include "common/string.h"
25 #include "fe_utils/recovery_gen.h"
26 #include "file_ops.h"
27 #include "filemap.h"
28 #include "getopt_long.h"
29 #include "pg_rewind.h"
30 #include "rewind_source.h"
31 #include "storage/bufpage.h"
/* Forward declarations of the file-local helpers defined further down. */
33 static void usage(const char *progname);
35 static void perform_rewind(filemap_t *filemap, rewind_source *source,
36 XLogRecPtr chkptrec,
37 TimeLineID chkpttli,
38 XLogRecPtr chkptredo);
40 static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
41 XLogRecPtr checkpointloc);
43 static void digestControlFile(ControlFileData *ControlFile,
44 const char *content, size_t size);
45 static void getRestoreCommand(const char *argv0);
46 static void sanityChecks(void);
47 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
48 static void ensureCleanShutdown(const char *argv0);
49 static void disconnect_atexit(void);
/*
 * Parsed copies of global/pg_control, filled in by digestControlFile():
 * the target, the source as read at startup, and the source as read again
 * at the end of perform_rewind().
 */
51 static ControlFileData ControlFile_target;
52 static ControlFileData ControlFile_source;
53 static ControlFileData ControlFile_source_after;
/* Program name, set in main() from get_progname(argv[0]). */
55 const char *progname;
/* WAL segment size; overwritten by every digestControlFile() call. */
56 int WalSegSz;
58 /* Configuration options */
/* Set in main() from -D, --source-pgdata and --source-server respectively. */
59 char *datadir_target = NULL;
60 char *datadir_source = NULL;
61 char *connstr_source = NULL;
/* Set by getRestoreCommand() when -c is given; stays NULL otherwise. */
62 char *restore_command = NULL;
/* Flags toggled by the option switch in main(). */
64 static bool debug = false;
65 bool showprogress = false;
66 bool dry_run = false;
67 bool do_sync = true;
68 bool restore_wal = false;
70 /* Target history */
/* Both are filled in by findCommonAncestorTimeline(). */
71 TimeLineHistoryEntry *targetHistory;
72 int targetNentries;
74 /* Progress counters */
/*
 * Read by progress_report().  main() sets fetch_size from the file map and
 * resets fetch_done to zero; fetch_done is presumably advanced by the copy
 * code outside this file -- TODO confirm.
 */
75 uint64 fetch_size;
76 uint64 fetch_done;
/*
 * Source access.  conn is only set when --source-server is used and is
 * closed either at the end of main() or by disconnect_atexit().  source is
 * created in main() by init_libpq_source() or init_local_source().
 */
78 static PGconn *conn;
79 static rewind_source *source;
81 static void
82 usage(const char *progname)
84 printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
85 printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
86 printf(_("Options:\n"));
87 printf(_(" -c, --restore-target-wal use restore_command in target configuration to\n"
88 " retrieve WAL files from archives\n"));
89 printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
90 printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
91 printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
92 printf(_(" -n, --dry-run stop before modifying anything\n"));
93 printf(_(" -N, --no-sync do not wait for changes to be written\n"
94 " safely to disk\n"));
95 printf(_(" -P, --progress write progress messages\n"));
96 printf(_(" -R, --write-recovery-conf write configuration for replication\n"
97 " (requires --source-server)\n"));
98 printf(_(" --debug write a lot of debug messages\n"));
99 printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n"));
100 printf(_(" -V, --version output version information, then exit\n"));
101 printf(_(" -?, --help show this help, then exit\n"));
102 printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
103 printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
/*
 * Program entry point.
 *
 * Parses and validates the command line, refuses to run as root, connects
 * to the source (libpq connection or local data directory), reads and
 * checks both control files, optionally runs ensureCleanShutdown() on the
 * target, and works out where the two clusters diverged.  If a rewind is
 * needed it builds the file map, calls perform_rewind(), syncs the target
 * directory and, with -R, writes the recovery configuration.
 *
 * Returns 0 on success; all error paths leave through exit(1) or pg_fatal().
 */
108 main(int argc, char **argv)
110 static struct option long_options[] = {
111 {"help", no_argument, NULL, '?'},
112 {"target-pgdata", required_argument, NULL, 'D'},
113 {"write-recovery-conf", no_argument, NULL, 'R'},
114 {"source-pgdata", required_argument, NULL, 1},
115 {"source-server", required_argument, NULL, 2},
116 {"no-ensure-shutdown", no_argument, NULL, 4},
117 {"version", no_argument, NULL, 'V'},
118 {"restore-target-wal", no_argument, NULL, 'c'},
119 {"dry-run", no_argument, NULL, 'n'},
120 {"no-sync", no_argument, NULL, 'N'},
121 {"progress", no_argument, NULL, 'P'},
122 {"debug", no_argument, NULL, 3},
123 {NULL, 0, NULL, 0}
125 int option_index;
126 int c;
127 XLogRecPtr divergerec;
128 int lastcommontliIndex;
129 XLogRecPtr chkptrec;
130 TimeLineID chkpttli;
131 XLogRecPtr chkptredo;
132 XLogRecPtr target_wal_endrec;
133 size_t size;
134 char *buffer;
135 bool no_ensure_shutdown = false;
136 bool rewind_needed;
137 bool writerecoveryconf = false;
138 filemap_t *filemap;
140 pg_logging_init(argv[0]);
141 set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
142 progname = get_progname(argv[0]);
144 /* Process command-line arguments */
/* --help and --version are only recognized as the first argument here. */
145 if (argc > 1)
147 if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
149 usage(progname);
150 exit(0);
152 if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
154 puts("pg_rewind (PostgreSQL) " PG_VERSION);
155 exit(0);
/* Options without a short form are returned as the small integers 1 to 4. */
159 while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
161 switch (c)
163 case '?':
164 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
165 exit(1);
167 case 'c':
168 restore_wal = true;
169 break;
171 case 'P':
172 showprogress = true;
173 break;
175 case 'n':
176 dry_run = true;
177 break;
179 case 'N':
180 do_sync = false;
181 break;
183 case 'R':
184 writerecoveryconf = true;
185 break;
/* --debug */
187 case 3:
188 debug = true;
189 pg_logging_increase_verbosity();
190 break;
192 case 'D': /* -D or --target-pgdata */
193 datadir_target = pg_strdup(optarg);
194 break;
196 case 1: /* --source-pgdata */
197 datadir_source = pg_strdup(optarg);
198 break;
200 case 2: /* --source-server */
201 connstr_source = pg_strdup(optarg);
202 break;
/* --no-ensure-shutdown */
204 case 4:
205 no_ensure_shutdown = true;
206 break;
/* Exactly one of the two source options must have been given. */
210 if (datadir_source == NULL && connstr_source == NULL)
212 pg_log_error("no source specified (--source-pgdata or --source-server)");
213 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
214 exit(1);
217 if (datadir_source != NULL && connstr_source != NULL)
219 pg_log_error("only one of --source-pgdata or --source-server can be specified");
220 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
221 exit(1);
224 if (datadir_target == NULL)
226 pg_log_error("no target data directory specified (--target-pgdata)");
227 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
228 exit(1);
231 if (writerecoveryconf && connstr_source == NULL)
233 pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
234 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
235 exit(1);
238 if (optind < argc)
240 pg_log_error("too many command-line arguments (first is \"%s\")",
241 argv[optind]);
242 fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
243 exit(1);
247 * Don't allow pg_rewind to be run as root, to avoid overwriting the
248 * ownership of files in the data directory. We need only check for root
249 * -- any other user won't have sufficient permissions to modify files in
250 * the data directory.
252 #ifndef WIN32
253 if (geteuid() == 0)
255 pg_log_error("cannot be executed by \"root\"");
256 fprintf(stderr, _("You must run %s as the PostgreSQL superuser.\n"),
257 progname);
258 exit(1);
260 #endif
262 get_restricted_token();
264 /* Set mask based on PGDATA permissions */
265 if (!GetDataDirectoryCreatePerm(datadir_target))
267 pg_log_error("could not read permissions of directory \"%s\": %m",
268 datadir_target);
269 exit(1);
272 umask(pg_mode_mask);
274 getRestoreCommand(argv[0]);
276 atexit(disconnect_atexit);
279 * Ok, we have all the options and we're ready to start. First, connect to
280 * remote server.
282 if (connstr_source)
284 conn = PQconnectdb(connstr_source);
286 if (PQstatus(conn) == CONNECTION_BAD)
287 pg_fatal("%s", PQerrorMessage(conn));
289 if (showprogress)
290 pg_log_info("connected to server");
292 source = init_libpq_source(conn);
294 else
295 source = init_local_source(datadir_source);
298 * Check the status of the target instance.
300 * If the target instance was not cleanly shut down, start and stop the
301 * target cluster once in single-user mode to enforce recovery to finish,
302 * ensuring that the cluster can be used by pg_rewind. Note that if
303 * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
304 * need to make sure by themselves that the target cluster is in a clean
305 * state.
307 buffer = slurpFile(datadir_target, "global/pg_control", &size);
308 digestControlFile(&ControlFile_target, buffer, size);
309 pg_free(buffer);
311 if (!no_ensure_shutdown &&
312 ControlFile_target.state != DB_SHUTDOWNED &&
313 ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
315 ensureCleanShutdown(argv[0]);
317 buffer = slurpFile(datadir_target, "global/pg_control", &size);
318 digestControlFile(&ControlFile_target, buffer, size);
319 pg_free(buffer);
322 buffer = source->fetch_file(source, "global/pg_control", &size);
323 digestControlFile(&ControlFile_source, buffer, size);
324 pg_free(buffer);
326 sanityChecks();
329 * Find the common ancestor timeline between the clusters.
331 * If both clusters are already on the same timeline, there's nothing to
332 * do.
334 if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
335 ControlFile_source.checkPointCopy.ThisTimeLineID)
337 pg_log_info("source and target cluster are on the same timeline");
338 rewind_needed = false;
339 target_wal_endrec = 0;
341 else
343 XLogRecPtr chkptendrec;
345 findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
346 pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
347 LSN_FORMAT_ARGS(divergerec),
348 targetHistory[lastcommontliIndex].tli);
351 * Determine the end-of-WAL on the target.
353 * The WAL ends at the last shutdown checkpoint, or at
354 * minRecoveryPoint if it was a standby. (If we supported rewinding a
355 * server that was not shut down cleanly, we would need to replay
356 * until we reach the first invalid record, like crash recovery does.)
359 /* read the checkpoint record on the target to see where it ends. */
360 chkptendrec = readOneRecord(datadir_target,
361 ControlFile_target.checkPoint,
362 targetNentries - 1,
363 restore_command);
365 if (ControlFile_target.minRecoveryPoint > chkptendrec)
367 target_wal_endrec = ControlFile_target.minRecoveryPoint;
369 else
371 target_wal_endrec = chkptendrec;
375 * Check for the possibility that the target is in fact a direct
376 * ancestor of the source. In that case, there is no divergent history
377 * in the target that needs rewinding.
379 if (target_wal_endrec > divergerec)
381 rewind_needed = true;
383 else
385 /* the last common checkpoint record must be part of target WAL */
386 Assert(target_wal_endrec == divergerec);
388 rewind_needed = false;
392 if (!rewind_needed)
394 pg_log_info("no rewind required");
395 if (writerecoveryconf && !dry_run)
396 WriteRecoveryConfig(conn, datadir_target,
397 GenerateRecoveryConfig(conn, NULL));
398 exit(0);
401 findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
402 &chkptrec, &chkpttli, &chkptredo, restore_command);
403 pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
404 LSN_FORMAT_ARGS(chkptrec), chkpttli);
406 /* Initialize the hash table to track the status of each file */
407 filehash_init();
410 * Collect information about all files in the both data directories.
412 if (showprogress)
413 pg_log_info("reading source file list");
414 source->traverse_files(source, &process_source_file);
416 if (showprogress)
417 pg_log_info("reading target file list");
418 traverse_datadir(datadir_target, &process_target_file);
421 * Read the target WAL from last checkpoint before the point of fork, to
422 * extract all the pages that were modified on the target cluster after
423 * the fork.
425 if (showprogress)
426 pg_log_info("reading WAL in target");
427 extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
428 target_wal_endrec, restore_command);
431 * We have collected all information we need from both systems. Decide
432 * what to do with each file.
434 filemap = decide_file_actions();
435 if (showprogress)
436 calculate_totals(filemap);
438 /* this is too verbose even for verbose mode */
439 if (debug)
440 print_filemap(filemap);
443 * Ok, we're ready to start copying things over.
445 if (showprogress)
447 pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
448 (unsigned long) (filemap->fetch_size / (1024 * 1024)),
449 (unsigned long) (filemap->total_size / (1024 * 1024)));
451 fetch_size = filemap->fetch_size;
452 fetch_done = 0;
456 * We have now collected all the information we need from both systems,
457 * and we are ready to start modifying the target directory.
459 * This is the point of no return. Once we start copying things, there is
460 * no turning back!
462 perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
464 if (showprogress)
465 pg_log_info("syncing target data directory");
466 sync_target_dir();
468 /* Also update the standby configuration, if requested. */
469 if (writerecoveryconf && !dry_run)
470 WriteRecoveryConfig(conn, datadir_target,
471 GenerateRecoveryConfig(conn, NULL));
473 /* don't need the source connection anymore */
474 source->destroy(source);
475 if (conn)
477 PQfinish(conn);
478 conn = NULL;
481 pg_log_info("Done!");
483 return 0;
487 * Perform the rewind.
489 * We have already collected all the information we need from the
490 * target and the source.
492 static void
493 perform_rewind(filemap_t *filemap, rewind_source *source,
494 XLogRecPtr chkptrec,
495 TimeLineID chkpttli,
496 XLogRecPtr chkptredo)
498 XLogRecPtr endrec;
499 TimeLineID endtli;
500 ControlFileData ControlFile_new;
501 size_t size;
502 char *buffer;
505 * Execute the actions in the file map, fetching data from the source
506 * system as needed.
508 for (int i = 0; i < filemap->nentries; i++)
510 file_entry_t *entry = filemap->entries[i];
513 * If this is a relation file, copy the modified blocks.
515 * This is in addition to any other changes.
517 if (entry->target_pages_to_overwrite.bitmapsize > 0)
519 datapagemap_iterator_t *iter;
520 BlockNumber blkno;
521 off_t offset;
523 iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
524 while (datapagemap_next(iter, &blkno))
526 offset = blkno * BLCKSZ;
527 source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
529 pg_free(iter);
532 switch (entry->action)
534 case FILE_ACTION_NONE:
535 /* nothing else to do */
536 break;
538 case FILE_ACTION_COPY:
539 /* Truncate the old file out of the way, if any */
540 open_target_file(entry->path, true);
541 source->queue_fetch_range(source, entry->path,
542 0, entry->source_size);
543 break;
545 case FILE_ACTION_TRUNCATE:
546 truncate_target_file(entry->path, entry->source_size);
547 break;
549 case FILE_ACTION_COPY_TAIL:
550 source->queue_fetch_range(source, entry->path,
551 entry->target_size,
552 entry->source_size - entry->target_size);
553 break;
555 case FILE_ACTION_REMOVE:
556 remove_target(entry);
557 break;
559 case FILE_ACTION_CREATE:
560 create_target(entry);
561 break;
563 case FILE_ACTION_UNDECIDED:
564 pg_fatal("no action decided for file \"%s\"", entry->path);
565 break;
569 /* Complete any remaining range-fetches that we queued up above. */
570 source->finish_fetch(source);
572 close_target_file();
574 progress_report(true);
577 * Fetch the control file from the source last. This ensures that the
578 * minRecoveryPoint is up-to-date.
580 buffer = source->fetch_file(source, "global/pg_control", &size);
581 digestControlFile(&ControlFile_source_after, buffer, size);
582 pg_free(buffer);
585 * Sanity check: If the source is a local system, the control file should
586 * not have changed since we started.
588 * XXX: We assume it hasn't been modified, but actually, what could go
589 * wrong? The logic handles a libpq source that's modified concurrently,
590 * why not a local datadir?
592 if (datadir_source &&
593 memcmp(&ControlFile_source, &ControlFile_source_after,
594 sizeof(ControlFileData)) != 0)
596 pg_fatal("source system was modified while pg_rewind was running");
599 if (showprogress)
600 pg_log_info("creating backup label and updating control file");
603 * Create a backup label file, to tell the target where to begin the WAL
604 * replay. Normally, from the last common checkpoint between the source
605 * and the target. But if the source is a standby server, it's possible
606 * that the last common checkpoint is *after* the standby's restartpoint.
607 * That implies that the source server has applied the checkpoint record,
608 * but hasn't performed a corresponding restartpoint yet. Make sure we
609 * start at the restartpoint's redo point in that case.
611 * Use the old version of the source's control file for this. The server
612 * might have finished the restartpoint after we started copying files,
613 * but we must begin from the redo point at the time that started copying.
615 if (ControlFile_source.checkPointCopy.redo < chkptredo)
617 chkptredo = ControlFile_source.checkPointCopy.redo;
618 chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
619 chkptrec = ControlFile_source.checkPoint;
621 createBackupLabel(chkptredo, chkpttli, chkptrec);
624 * Update control file of target, to tell the target how far it must
625 * replay the WAL (minRecoveryPoint).
627 if (connstr_source)
630 * The source is a live server. Like in an online backup, it's
631 * important that we recover all the WAL that was generated while we
632 * were copying files.
634 if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
637 * Source is a standby server. We must replay to its
638 * minRecoveryPoint.
640 endrec = ControlFile_source_after.minRecoveryPoint;
641 endtli = ControlFile_source_after.minRecoveryPointTLI;
643 else
646 * Source is a production, non-standby, server. We must replay to
647 * the last WAL insert location.
649 if (ControlFile_source_after.state != DB_IN_PRODUCTION)
650 pg_fatal("source system was in unexpected state at end of rewind");
652 endrec = source->get_current_wal_insert_lsn(source);
653 endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
656 else
659 * Source is a local data directory. It should've shut down cleanly,
660 * and we must replay to the latest shutdown checkpoint.
662 endrec = ControlFile_source_after.checkPoint;
663 endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
666 memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
667 ControlFile_new.minRecoveryPoint = endrec;
668 ControlFile_new.minRecoveryPointTLI = endtli;
669 ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
670 if (!dry_run)
671 update_controlfile(datadir_target, &ControlFile_new, do_sync);
674 static void
675 sanityChecks(void)
677 /* TODO Check that there's no backup_label in either cluster */
679 /* Check system_identifier match */
680 if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
681 pg_fatal("source and target clusters are from different systems");
683 /* check version */
684 if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
685 ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
686 ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
687 ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
689 pg_fatal("clusters are not compatible with this version of pg_rewind");
693 * Target cluster need to use checksums or hint bit wal-logging, this to
694 * prevent from data corruption that could occur because of hint bits.
696 if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
697 !ControlFile_target.wal_log_hints)
699 pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
703 * Target cluster better not be running. This doesn't guard against
704 * someone starting the cluster concurrently. Also, this is probably more
705 * strict than necessary; it's OK if the target node was not shut down
706 * cleanly, as long as it isn't running at the moment.
708 if (ControlFile_target.state != DB_SHUTDOWNED &&
709 ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
710 pg_fatal("target server must be shut down cleanly");
713 * When the source is a data directory, also require that the source
714 * server is shut down. There isn't any very strong reason for this
715 * limitation, but better safe than sorry.
717 if (datadir_source &&
718 ControlFile_source.state != DB_SHUTDOWNED &&
719 ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
720 pg_fatal("source data directory must be shut down cleanly");
724 * Print a progress report based on the fetch_size and fetch_done variables.
726 * Progress report is written at maximum once per second, except that the
727 * last progress report is always printed.
729 * If finished is set to true, this is the last progress report. The cursor
730 * is moved to the next line.
732 void
733 progress_report(bool finished)
735 static pg_time_t last_progress_report = 0;
736 int percent;
737 char fetch_done_str[32];
738 char fetch_size_str[32];
739 pg_time_t now;
741 if (!showprogress)
742 return;
744 now = time(NULL);
745 if (now == last_progress_report && !finished)
746 return; /* Max once per second */
748 last_progress_report = now;
749 percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
752 * Avoid overflowing past 100% or the full size. This may make the total
753 * size number change as we approach the end of the backup (the estimate
754 * will always be wrong if WAL is included), but that's better than having
755 * the done column be bigger than the total.
757 if (percent > 100)
758 percent = 100;
759 if (fetch_done > fetch_size)
760 fetch_size = fetch_done;
762 snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
763 fetch_done / 1024);
764 snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
765 fetch_size / 1024);
767 fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
768 (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
769 percent);
772 * Stay on the same line if reporting to a terminal and we're not done
773 * yet.
775 fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
779 * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
780 * infinity as src/include/access/timeline.h states. This routine should
781 * be used only when comparing WAL locations related to history files.
783 static XLogRecPtr
784 MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
786 if (XLogRecPtrIsInvalid(a))
787 return b;
788 else if (XLogRecPtrIsInvalid(b))
789 return a;
790 else
791 return Min(a, b);
795 * Retrieve timeline history for given control file which should behold
796 * either source or target.
798 static TimeLineHistoryEntry *
799 getTimelineHistory(ControlFileData *controlFile, int *nentries)
801 TimeLineHistoryEntry *history;
802 TimeLineID tli;
804 tli = controlFile->checkPointCopy.ThisTimeLineID;
807 * Timeline 1 does not have a history file, so there is no need to check
808 * and fake an entry with infinite start and end positions.
810 if (tli == 1)
812 history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
813 history->tli = tli;
814 history->begin = history->end = InvalidXLogRecPtr;
815 *nentries = 1;
817 else
819 char path[MAXPGPATH];
820 char *histfile;
822 TLHistoryFilePath(path, tli);
824 /* Get history file from appropriate source */
825 if (controlFile == &ControlFile_source)
826 histfile = source->fetch_file(source, path, NULL);
827 else if (controlFile == &ControlFile_target)
828 histfile = slurpFile(datadir_target, path, NULL);
829 else
830 pg_fatal("invalid control file");
832 history = rewind_parseTimeLineHistory(histfile, tli, nentries);
833 pg_free(histfile);
836 if (debug)
838 int i;
840 if (controlFile == &ControlFile_source)
841 pg_log_debug("Source timeline history:");
842 else if (controlFile == &ControlFile_target)
843 pg_log_debug("Target timeline history:");
844 else
845 Assert(false);
848 * Print the target timeline history.
850 for (i = 0; i < targetNentries; i++)
852 TimeLineHistoryEntry *entry;
854 entry = &history[i];
855 pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
856 LSN_FORMAT_ARGS(entry->begin),
857 LSN_FORMAT_ARGS(entry->end));
861 return history;
865 * Determine the TLI of the last common timeline in the timeline history of the
866 * two clusters. targetHistory is filled with target timeline history and
867 * targetNentries is number of items in targetHistory. *tliIndex is set to the
868 * index of last common timeline in targetHistory array, and *recptr is set to
869 * the position where the timeline history diverged (ie. the first WAL record
870 * that's not the same in both clusters).
872 * Control files of both clusters must be read into ControlFile_target/source
873 * before calling this routine.
875 static void
876 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
878 TimeLineHistoryEntry *sourceHistory;
879 int sourceNentries;
880 int i,
883 /* Retrieve timelines for both source and target */
884 sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
885 targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
888 * Trace the history forward, until we hit the timeline diverge. It may
889 * still be possible that the source and target nodes used the same
890 * timeline number in their history but with different start position
891 * depending on the history files that each node has fetched in previous
892 * recovery processes. Hence check the start position of the new timeline
893 * as well and move down by one extra timeline entry if they do not match.
895 n = Min(sourceNentries, targetNentries);
896 for (i = 0; i < n; i++)
898 if (sourceHistory[i].tli != targetHistory[i].tli ||
899 sourceHistory[i].begin != targetHistory[i].begin)
900 break;
903 if (i > 0)
905 i--;
906 *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
907 *tliIndex = i;
909 pg_free(sourceHistory);
910 return;
912 else
914 pg_fatal("could not find common ancestor of the source and target cluster's timelines");
920 * Create a backup_label file that forces recovery to begin at the last common
921 * checkpoint.
923 static void
924 createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
926 XLogSegNo startsegno;
927 time_t stamp_time;
928 char strfbuf[128];
929 char xlogfilename[MAXFNAMELEN];
930 struct tm *tmp;
931 char buf[1000];
932 int len;
934 XLByteToSeg(startpoint, startsegno, WalSegSz);
935 XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
938 * Construct backup label file
940 stamp_time = time(NULL);
941 tmp = localtime(&stamp_time);
942 strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
944 len = snprintf(buf, sizeof(buf),
945 "START WAL LOCATION: %X/%X (file %s)\n"
946 "CHECKPOINT LOCATION: %X/%X\n"
947 "BACKUP METHOD: pg_rewind\n"
948 "BACKUP FROM: standby\n"
949 "START TIME: %s\n",
950 /* omit LABEL: line */
951 LSN_FORMAT_ARGS(startpoint), xlogfilename,
952 LSN_FORMAT_ARGS(checkpointloc),
953 strfbuf);
954 if (len >= sizeof(buf))
955 pg_fatal("backup label buffer too small"); /* shouldn't happen */
957 /* TODO: move old file out of the way, if any. */
958 open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
959 write_target_range(buf, 0, len);
960 close_target_file();
964 * Check CRC of control file
966 static void
967 checkControlFile(ControlFileData *ControlFile)
969 pg_crc32c crc;
971 /* Calculate CRC */
972 INIT_CRC32C(crc);
973 COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc));
974 FIN_CRC32C(crc);
976 /* And simply compare it */
977 if (!EQ_CRC32C(crc, ControlFile->crc))
978 pg_fatal("unexpected control file CRC");
982 * Verify control file contents in the buffer 'content', and copy it to
983 * *ControlFile.
985 static void
986 digestControlFile(ControlFileData *ControlFile, const char *content,
987 size_t size)
989 if (size != PG_CONTROL_FILE_SIZE)
990 pg_fatal("unexpected control file size %d, expected %d",
991 (int) size, PG_CONTROL_FILE_SIZE);
993 memcpy(ControlFile, content, sizeof(ControlFileData));
995 /* set and validate WalSegSz */
996 WalSegSz = ControlFile->xlog_seg_size;
998 if (!IsValidWalSegSize(WalSegSz))
999 pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
1000 "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
1001 WalSegSz),
1002 WalSegSz);
1004 /* Additional checks on control file */
1005 checkControlFile(ControlFile);
1009 * Get value of GUC parameter restore_command from the target cluster.
1011 * This uses a logic based on "postgres -C" to get the value from the
1012 * cluster.
1014 static void
1015 getRestoreCommand(const char *argv0)
1017 int rc;
1018 char postgres_exec_path[MAXPGPATH],
1019 postgres_cmd[MAXPGPATH],
1020 cmd_output[MAXPGPATH];
1022 if (!restore_wal)
1023 return;
1025 /* find postgres executable */
1026 rc = find_other_exec(argv0, "postgres",
1027 PG_BACKEND_VERSIONSTR,
1028 postgres_exec_path);
1030 if (rc < 0)
1032 char full_path[MAXPGPATH];
1034 if (find_my_exec(argv0, full_path) < 0)
1035 strlcpy(full_path, progname, sizeof(full_path));
1037 if (rc == -1)
1038 pg_log_error("The program \"%s\" is needed by %s but was not found in the\n"
1039 "same directory as \"%s\".\n"
1040 "Check your installation.",
1041 "postgres", progname, full_path);
1042 else
1043 pg_log_error("The program \"%s\" was found by \"%s\"\n"
1044 "but was not the same version as %s.\n"
1045 "Check your installation.",
1046 "postgres", full_path, progname);
1047 exit(1);
1051 * Build a command able to retrieve the value of GUC parameter
1052 * restore_command, if set.
1054 snprintf(postgres_cmd, sizeof(postgres_cmd),
1055 "\"%s\" -D \"%s\" -C restore_command",
1056 postgres_exec_path, datadir_target);
1058 if (!pipe_read_line(postgres_cmd, cmd_output, sizeof(cmd_output)))
1059 exit(1);
1061 (void) pg_strip_crlf(cmd_output);
1063 if (strcmp(cmd_output, "") == 0)
1064 pg_fatal("restore_command is not set in the target cluster");
1066 restore_command = pg_strdup(cmd_output);
1068 pg_log_debug("using for rewind restore_command = \'%s\'",
1069 restore_command);
1074 * Ensure clean shutdown of target instance by launching single-user mode
1075 * postgres to do crash recovery.
1077 static void
1078 ensureCleanShutdown(const char *argv0)
1080 int ret;
1081 #define MAXCMDLEN (2 * MAXPGPATH)
1082 char exec_path[MAXPGPATH];
1083 char cmd[MAXCMDLEN];
1085 /* locate postgres binary */
1086 if ((ret = find_other_exec(argv0, "postgres",
1087 PG_BACKEND_VERSIONSTR,
1088 exec_path)) < 0)
1090 char full_path[MAXPGPATH];
1092 if (find_my_exec(argv0, full_path) < 0)
1093 strlcpy(full_path, progname, sizeof(full_path));
1095 if (ret == -1)
1096 pg_fatal("The program \"%s\" is needed by %s but was not found in the\n"
1097 "same directory as \"%s\".\n"
1098 "Check your installation.",
1099 "postgres", progname, full_path);
1100 else
1101 pg_fatal("The program \"%s\" was found by \"%s\"\n"
1102 "but was not the same version as %s.\n"
1103 "Check your installation.",
1104 "postgres", full_path, progname);
1107 pg_log_info("executing \"%s\" for target server to complete crash recovery",
1108 exec_path);
1111 * Skip processing if requested, but only after ensuring presence of
1112 * postgres.
1114 if (dry_run)
1115 return;
1118 * Finally run postgres in single-user mode. There is no need to use
1119 * fsync here. This makes the recovery faster, and the target data folder
1120 * is synced at the end anyway.
1122 snprintf(cmd, MAXCMDLEN, "\"%s\" --single -F -D \"%s\" template1 < \"%s\"",
1123 exec_path, datadir_target, DEVNULL);
1125 if (system(cmd) != 0)
1127 pg_log_error("postgres single-user mode in target cluster failed");
1128 pg_fatal("Command was: %s", cmd);
1132 static void
1133 disconnect_atexit(void)
1135 if (conn != NULL)
1136 PQfinish(conn);