Minor improvements in backup and recovery:
[PostgreSQL.git] / src / backend / postmaster / pgarch.c
blob4e0411a7126519731b2aa1030a04532f9bd02883
1 /*-------------------------------------------------------------------------
3 * pgarch.c
5 * PostgreSQL WAL archiver
7 * All functions relating to archiver are included here
9 * - All functions executed by archiver process
11 * - archiver is forked from postmaster, and the two
12 * processes then communicate using signals. All functions
13 * executed by postmaster are included in this file.
15 * Initial author: Simon Riggs simon@2ndquadrant.com
17 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
21 * IDENTIFICATION
22 * $PostgreSQL$
24 *-------------------------------------------------------------------------
26 #include "postgres.h"
28 #include <fcntl.h>
29 #include <signal.h>
30 #include <time.h>
31 #include <sys/time.h>
32 #include <sys/wait.h>
33 #include <unistd.h>
35 #include "access/xlog_internal.h"
36 #include "libpq/pqsignal.h"
37 #include "miscadmin.h"
38 #include "postmaster/fork_process.h"
39 #include "postmaster/pgarch.h"
40 #include "postmaster/postmaster.h"
41 #include "storage/fd.h"
42 #include "storage/ipc.h"
43 #include "storage/pg_shmem.h"
44 #include "storage/pmsignal.h"
45 #include "utils/guc.h"
46 #include "utils/ps_status.h"
/* ----------
 * Timer definitions.  Both intervals are expressed in seconds.
 * ----------
 */

/* Longest the archiver waits before polling the archive status directory. */
#define PGARCH_AUTOWAKE_INTERVAL 60

/* Minimum spacing between attempts to restart a failed archiver. */
#define PGARCH_RESTART_INTERVAL 10
/* ----------
 * Archiver control info.
 *
 * Archivable files in pg_xlog are expected to have names whose length lies
 * between MIN_XFN_CHARS and MAX_XFN_CHARS, made up solely of characters
 * found in VALID_XFN_CHARS.  Each such file has a status file in
 * archive_status carrying the same name plus ".ready" or ".done".
 * ----------
 */
#define MIN_XFN_CHARS	16
#define MAX_XFN_CHARS	40
#define VALID_XFN_CHARS "0123456789ABCDEF.history.backup"

/* Failed attempts on one file after which the copy loop gives up for now. */
#define NUM_ARCHIVE_RETRIES 3
/* ----------
 * Local data
 * ----------
 */

/* When pgarch_start() last tried to launch an archiver (0 = never). */
static time_t last_pgarch_start_time;

/*
 * Flags raised by the signal handlers; the main loop notices and clears
 * them at a convenient point.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t wakened = false;
87 /* ----------
88 * Local function forward declarations
89 * ----------
91 #ifdef EXEC_BACKEND
92 static pid_t pgarch_forkexec(void);
93 #endif
95 NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]);
96 static void pgarch_exit(SIGNAL_ARGS);
97 static void ArchSigHupHandler(SIGNAL_ARGS);
98 static void pgarch_waken(SIGNAL_ARGS);
99 static void pgarch_MainLoop(void);
100 static void pgarch_ArchiverCopyLoop(void);
101 static bool pgarch_archiveXlog(char *xlog);
102 static bool pgarch_readyXlog(char *xlog);
103 static void pgarch_archiveDone(char *xlog);
106 /* ------------------------------------------------------------
107 * Public functions called from postmaster follow
108 * ------------------------------------------------------------
112 * pgarch_start
114 * Called from postmaster at startup or after an existing archiver
115 * died. Attempt to fire up a fresh archiver process.
117 * Returns PID of child process, or 0 if fail.
119 * Note: if fail, we will be called again from the postmaster main loop.
122 pgarch_start(void)
124 time_t curtime;
125 pid_t pgArchPid;
128 * Do nothing if no archiver needed
130 if (!XLogArchivingActive())
131 return 0;
134 * Do nothing if too soon since last archiver start. This is a safety
135 * valve to protect against continuous respawn attempts if the archiver is
136 * dying immediately at launch. Note that since we will be re-called from
137 * the postmaster main loop, we will get another chance later.
139 curtime = time(NULL);
140 if ((unsigned int) (curtime - last_pgarch_start_time) <
141 (unsigned int) PGARCH_RESTART_INTERVAL)
142 return 0;
143 last_pgarch_start_time = curtime;
145 #ifdef EXEC_BACKEND
146 switch ((pgArchPid = pgarch_forkexec()))
147 #else
148 switch ((pgArchPid = fork_process()))
149 #endif
151 case -1:
152 ereport(LOG,
153 (errmsg("could not fork archiver: %m")));
154 return 0;
156 #ifndef EXEC_BACKEND
157 case 0:
158 /* in postmaster child ... */
159 /* Close the postmaster's sockets */
160 ClosePostmasterPorts(false);
162 /* Lose the postmaster's on-exit routines */
163 on_exit_reset();
165 /* Drop our connection to postmaster's shared memory, as well */
166 PGSharedMemoryDetach();
168 PgArchiverMain(0, NULL);
169 break;
170 #endif
172 default:
173 return (int) pgArchPid;
176 /* shouldn't get here */
177 return 0;
180 /* ------------------------------------------------------------
181 * Local functions called by archiver follow
182 * ------------------------------------------------------------
#ifdef EXEC_BACKEND

/*
 * pgarch_forkexec() -
 *
 * Build the argument list for the archiver process, then fork and exec it
 * through postmaster_forkexec().  Returns whatever postmaster_forkexec()
 * returns.
 */
static pid_t
pgarch_forkexec(void)
{
	char	   *av[10];
	int			ac;

	av[0] = "postgres";
	av[1] = "--forkarch";
	av[2] = NULL;				/* filled in by postmaster_forkexec */
	ac = 3;

	/* Terminate the vector and make sure it did not outgrow the array. */
	av[ac] = NULL;
	Assert(ac < lengthof(av));

	return postmaster_forkexec(ac, av);
}
#endif   /* EXEC_BACKEND */
214 * PgArchiverMain
216 * The argc/argv parameters are valid only in EXEC_BACKEND case. However,
217 * since we don't use 'em, it hardly matters...
219 NON_EXEC_STATIC void
220 PgArchiverMain(int argc, char *argv[])
222 IsUnderPostmaster = true; /* we are a postmaster subprocess now */
224 MyProcPid = getpid(); /* reset MyProcPid */
226 MyStartTime = time(NULL); /* record Start Time for logging */
229 * If possible, make this process a group leader, so that the postmaster
230 * can signal any child processes too.
232 #ifdef HAVE_SETSID
233 if (setsid() < 0)
234 elog(FATAL, "setsid() failed: %m");
235 #endif
238 * Ignore all signals usually bound to some action in the postmaster,
239 * except for SIGHUP, SIGUSR1 and SIGQUIT.
241 pqsignal(SIGHUP, ArchSigHupHandler);
242 pqsignal(SIGINT, SIG_IGN);
243 pqsignal(SIGTERM, SIG_IGN);
244 pqsignal(SIGQUIT, pgarch_exit);
245 pqsignal(SIGALRM, SIG_IGN);
246 pqsignal(SIGPIPE, SIG_IGN);
247 pqsignal(SIGUSR1, pgarch_waken);
248 pqsignal(SIGUSR2, SIG_IGN);
249 pqsignal(SIGCHLD, SIG_DFL);
250 pqsignal(SIGTTIN, SIG_DFL);
251 pqsignal(SIGTTOU, SIG_DFL);
252 pqsignal(SIGCONT, SIG_DFL);
253 pqsignal(SIGWINCH, SIG_DFL);
254 PG_SETMASK(&UnBlockSig);
257 * Identify myself via ps
259 init_ps_display("archiver process", "", "", "");
261 pgarch_MainLoop();
263 exit(0);
/*
 * pgarch_exit
 *
 * SIGQUIT handler for the archiver process: terminate at once with exit
 * status 0.
 */
static void
pgarch_exit(SIGNAL_ARGS)
{
	/*
	 * No attempt is made to finish pending archive copies first.  That
	 * might look tidier, but there is a nontrivial risk that init would
	 * kill us partway through, so we simply leave.
	 */
	exit(0);
}
278 /* SIGHUP: set flag to re-read config file at next convenient time */
279 static void
280 ArchSigHupHandler(SIGNAL_ARGS)
282 got_SIGHUP = true;
285 /* SIGUSR1 signal handler for archiver process */
286 static void
287 pgarch_waken(SIGNAL_ARGS)
289 wakened = true;
293 * pgarch_MainLoop
295 * Main loop for archiver
297 static void
298 pgarch_MainLoop(void)
300 time_t last_copy_time = 0;
303 * We run the copy loop immediately upon entry, in case there are
304 * unarchived files left over from a previous database run (or maybe the
305 * archiver died unexpectedly). After that we wait for a signal or
306 * timeout before doing more.
308 wakened = true;
312 /* Check for config update */
313 if (got_SIGHUP)
315 got_SIGHUP = false;
316 ProcessConfigFile(PGC_SIGHUP);
319 /* Do what we're here for */
320 if (wakened)
322 wakened = false;
323 pgarch_ArchiverCopyLoop();
324 last_copy_time = time(NULL);
328 * There shouldn't be anything for the archiver to do except to wait
329 * for a signal ... however, the archiver exists to protect our data,
330 * so she wakes up occasionally to allow herself to be proactive.
332 * On some platforms, signals won't interrupt the sleep. To ensure we
333 * respond reasonably promptly when someone signals us, break down the
334 * sleep into 1-second increments, and check for interrupts after each
335 * nap.
337 while (!(wakened || got_SIGHUP))
339 time_t curtime;
341 pg_usleep(1000000L);
342 curtime = time(NULL);
343 if ((unsigned int) (curtime - last_copy_time) >=
344 (unsigned int) PGARCH_AUTOWAKE_INTERVAL)
345 wakened = true;
347 } while (PostmasterIsAlive(true));
351 * pgarch_ArchiverCopyLoop
353 * Archives all outstanding xlogs then returns
355 static void
356 pgarch_ArchiverCopyLoop(void)
358 char xlog[MAX_XFN_CHARS + 1];
360 if (!XLogArchiveCommandSet())
362 ereport(WARNING,
363 (errmsg("archive_mode enabled, yet archive_command is not set")));
364 /* can't do anything if no command ... */
365 return;
369 * loop through all xlogs with archive_status of .ready and archive
370 * them...mostly we expect this to be a single file, though it is possible
371 * some backend will add files onto the list of those that need archiving
372 * while we are still copying earlier archives
374 while (pgarch_readyXlog(xlog))
376 int failures = 0;
378 for (;;)
380 /* Abandon processing if we notice our postmaster has died */
381 if (!PostmasterIsAlive(true))
382 return;
384 if (pgarch_archiveXlog(xlog))
386 /* successful */
387 pgarch_archiveDone(xlog);
388 break; /* out of inner retry loop */
390 else
392 if (++failures >= NUM_ARCHIVE_RETRIES)
394 ereport(WARNING,
395 (errmsg("transaction log file \"%s\" could not be archived: too many failures",
396 xlog)));
397 return; /* give up archiving for now */
399 pg_usleep(1000000L); /* wait a bit before retrying */
406 * pgarch_archiveXlog
408 * Invokes system(3) to copy one archive file to wherever it should go
410 * Returns true if successful
412 static bool
413 pgarch_archiveXlog(char *xlog)
415 char xlogarchcmd[MAXPGPATH];
416 char pathname[MAXPGPATH];
417 char *dp;
418 char *endp;
419 const char *sp;
420 int rc;
422 snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
425 * construct the command to be executed
427 dp = xlogarchcmd;
428 endp = xlogarchcmd + MAXPGPATH - 1;
429 *endp = '\0';
431 for (sp = XLogArchiveCommand; *sp; sp++)
433 if (*sp == '%')
435 switch (sp[1])
437 case 'p':
438 /* %p: relative path of source file */
439 sp++;
440 strlcpy(dp, pathname, endp - dp);
441 make_native_path(dp);
442 dp += strlen(dp);
443 break;
444 case 'f':
445 /* %f: filename of source file */
446 sp++;
447 strlcpy(dp, xlog, endp - dp);
448 dp += strlen(dp);
449 break;
450 case '%':
451 /* convert %% to a single % */
452 sp++;
453 if (dp < endp)
454 *dp++ = *sp;
455 break;
456 default:
457 /* otherwise treat the % as not special */
458 if (dp < endp)
459 *dp++ = *sp;
460 break;
463 else
465 if (dp < endp)
466 *dp++ = *sp;
469 *dp = '\0';
471 ereport(DEBUG3,
472 (errmsg_internal("executing archive command \"%s\"",
473 xlogarchcmd)));
474 rc = system(xlogarchcmd);
475 if (rc != 0)
478 * If either the shell itself, or a called command, died on a signal,
479 * abort the archiver. We do this because system() ignores SIGINT and
480 * SIGQUIT while waiting; so a signal is very likely something that
481 * should have interrupted us too. If we overreact it's no big deal,
482 * the postmaster will just start the archiver again.
484 * Per the Single Unix Spec, shells report exit status > 128 when
485 * a called command died on a signal.
487 bool signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 128;
489 ereport(signaled ? FATAL : LOG,
490 (errmsg("archive command \"%s\" failed: return code %d",
491 xlogarchcmd, rc)));
493 return false;
495 ereport(LOG,
496 (errmsg("archived transaction log file \"%s\"", xlog)));
498 return true;
502 * pgarch_readyXlog
504 * Return name of the oldest xlog file that has not yet been archived.
505 * No notification is set that file archiving is now in progress, so
506 * this would need to be extended if multiple concurrent archival
507 * tasks were created. If a failure occurs, we will completely
508 * re-copy the file at the next available opportunity.
510 * It is important that we return the oldest, so that we archive xlogs
511 * in order that they were written, for two reasons:
512 * 1) to maintain the sequential chain of xlogs required for recovery
513 * 2) because the oldest ones will sooner become candidates for
514 * recycling at time of checkpoint
516 * NOTE: the "oldest" comparison will presently consider all segments of
517 * a timeline with a smaller ID to be older than all segments of a timeline
518 * with a larger ID; the net result being that past timelines are given
519 * higher priority for archiving. This seems okay, or at least not
520 * obviously worth changing.
522 static bool
523 pgarch_readyXlog(char *xlog)
526 * open xlog status directory and read through list of xlogs that have the
527 * .ready suffix, looking for earliest file. It is possible to optimise
528 * this code, though only a single file is expected on the vast majority
529 * of calls, so....
531 char XLogArchiveStatusDir[MAXPGPATH];
532 char newxlog[MAX_XFN_CHARS + 6 + 1];
533 DIR *rldir;
534 struct dirent *rlde;
535 bool found = false;
537 snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
538 rldir = AllocateDir(XLogArchiveStatusDir);
539 if (rldir == NULL)
540 ereport(ERROR,
541 (errcode_for_file_access(),
542 errmsg("could not open archive status directory \"%s\": %m",
543 XLogArchiveStatusDir)));
545 while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
547 int basenamelen = (int) strlen(rlde->d_name) - 6;
549 if (basenamelen >= MIN_XFN_CHARS &&
550 basenamelen <= MAX_XFN_CHARS &&
551 strspn(rlde->d_name, VALID_XFN_CHARS) >= basenamelen &&
552 strcmp(rlde->d_name + basenamelen, ".ready") == 0)
554 if (!found)
556 strcpy(newxlog, rlde->d_name);
557 found = true;
559 else
561 if (strcmp(rlde->d_name, newxlog) < 0)
562 strcpy(newxlog, rlde->d_name);
566 FreeDir(rldir);
568 if (found)
570 /* truncate off the .ready */
571 newxlog[strlen(newxlog) - 6] = '\0';
572 strcpy(xlog, newxlog);
574 return found;
578 * pgarch_archiveDone
580 * Emit notification that an xlog file has been successfully archived.
581 * We do this by renaming the status file from NNN.ready to NNN.done.
582 * Eventually, a checkpoint process will notice this and delete both the
583 * NNN.done file and the xlog file itself.
585 static void
586 pgarch_archiveDone(char *xlog)
588 char rlogready[MAXPGPATH];
589 char rlogdone[MAXPGPATH];
591 StatusFilePath(rlogready, xlog, ".ready");
592 StatusFilePath(rlogdone, xlog, ".done");
593 if (rename(rlogready, rlogdone) < 0)
594 ereport(WARNING,
595 (errcode_for_file_access(),
596 errmsg("could not rename file \"%s\" to \"%s\": %m",
597 rlogready, rlogdone)));