Minor improvements in backup and recovery:
[PostgreSQL.git] / src / backend / postmaster / autovacuum.c
blob5b4c2145f097ac00327116d9a7a61f4f098bcc81
1 /*-------------------------------------------------------------------------
3 * autovacuum.c
5 * PostgreSQL Integrated Autovacuum Daemon
7 * The autovacuum system is structured in two different kinds of processes: the
8 * autovacuum launcher and the autovacuum worker. The launcher is an
9 * always-running process, started by the postmaster when the autovacuum GUC
10 * parameter is set. The launcher schedules autovacuum workers to be started
11 * when appropriate. The workers are the processes which execute the actual
12 * vacuuming; they connect to a database as determined in the launcher, and
13 * once connected they examine the catalogs to select the tables to vacuum.
15 * The autovacuum launcher cannot start the worker processes by itself,
16 * because doing so would cause robustness issues (namely, failure to shut
17 * them down on exceptional conditions, and also, since the launcher is
18 * connected to shared memory and is thus subject to corruption there, it is
19 * not as robust as the postmaster). So it leaves that task to the postmaster.
21 * There is an autovacuum shared memory area, where the launcher stores
22 * information about the database it wants vacuumed. When it wants a new
23 * worker to start, it sets a flag in shared memory and sends a signal to the
24 * postmaster. The postmaster knows nothing more than that it must start a worker;
25 * so it forks a new child, which turns into a worker. This new process
26 * connects to shared memory, and there it can inspect the information that the
27 * launcher has set up.
29 * If the fork() call fails in the postmaster, it sets a flag in the shared
30 * memory area, and sends a signal to the launcher. The launcher, upon
31 * noticing the flag, can try starting the worker again by resending the
32 * signal. Note that the failure can only be transient (fork failure due to
33 * high load, memory pressure, too many processes, etc); more permanent
34 * problems, like failure to connect to a database, are detected later in the
35 * worker and dealt with just by having the worker exit normally. The launcher
36 * will launch a new worker again later, per schedule.
38 * When the worker is done vacuuming it sends SIGUSR1 to the launcher. The
39 * launcher then wakes up and is able to launch another worker, if the schedule
40 * is so tight that a new worker is needed immediately. At this time the
41 * launcher can also balance the settings for the various remaining workers'
42 * cost-based vacuum delay feature.
44 * Note that there can be more than one worker in a database concurrently.
45 * They will store the table they are currently vacuuming in shared memory, so
46 * that other workers avoid being blocked waiting for the vacuum lock for that
47 * table. They will also reload the pgstats data just before vacuuming each
48 * table, to avoid vacuuming a table that was just finished being vacuumed by
49 * another worker and thus is no longer noted in shared memory. However,
50 * there is a window (caused by pgstat delay) during which a worker may choose a
51 * table that was already vacuumed; this is a bug in the current design.
53 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
54 * Portions Copyright (c) 1994, Regents of the University of California
57 * IDENTIFICATION
58 * $PostgreSQL$
60 *-------------------------------------------------------------------------
62 #include "postgres.h"
64 #include <signal.h>
65 #include <sys/types.h>
66 #include <sys/time.h>
67 #include <time.h>
68 #include <unistd.h>
70 #include "access/genam.h"
71 #include "access/heapam.h"
72 #include "access/transam.h"
73 #include "access/xact.h"
74 #include "catalog/indexing.h"
75 #include "catalog/namespace.h"
76 #include "catalog/pg_autovacuum.h"
77 #include "catalog/pg_database.h"
78 #include "commands/dbcommands.h"
79 #include "commands/vacuum.h"
80 #include "libpq/hba.h"
81 #include "libpq/pqsignal.h"
82 #include "miscadmin.h"
83 #include "pgstat.h"
84 #include "postmaster/autovacuum.h"
85 #include "postmaster/fork_process.h"
86 #include "postmaster/postmaster.h"
87 #include "storage/fd.h"
88 #include "storage/ipc.h"
89 #include "storage/pmsignal.h"
90 #include "storage/proc.h"
91 #include "storage/procarray.h"
92 #include "storage/sinval.h"
93 #include "tcop/tcopprot.h"
94 #include "utils/flatfiles.h"
95 #include "utils/fmgroids.h"
96 #include "utils/lsyscache.h"
97 #include "utils/memutils.h"
98 #include "utils/ps_status.h"
99 #include "utils/syscache.h"
/*
 * GUC parameters
 *
 * These are the configuration knobs of the autovacuum subsystem.  Within
 * this file, autovacuum_naptime is used as a number of seconds (it is
 * assigned to timeval.tv_sec and multiplied by 1000 to get milliseconds),
 * and autovacuum_max_workers determines the size of the WorkerInfo array
 * kept in shared memory.
 */
bool		autovacuum_start_daemon = false;
int			autovacuum_max_workers;
int			autovacuum_naptime;
int			autovacuum_vac_thresh;
double		autovacuum_vac_scale;
int			autovacuum_anl_thresh;
double		autovacuum_anl_scale;
int			autovacuum_freeze_max_age;

/* cost-based vacuum delay settings */
int			autovacuum_vac_cost_delay;
int			autovacuum_vac_cost_limit;

/* NOTE(review): -1 presumably disables duration logging -- confirm in guc.c */
int			Log_autovacuum_min_duration = -1;
119 /* how long to keep pgstat data in the launcher, in milliseconds */
120 #define STATS_READ_DELAY 1000
123 /* Flags to tell if we are in an autovacuum process */
124 static bool am_autovacuum_launcher = false;
125 static bool am_autovacuum_worker = false;
127 /* Flags set by signal handlers */
128 static volatile sig_atomic_t got_SIGHUP = false;
129 static volatile sig_atomic_t got_SIGUSR1 = false;
130 static volatile sig_atomic_t got_SIGTERM = false;
132 /* Comparison point for determining whether freeze_max_age is exceeded */
133 static TransactionId recentXid;
135 /* Default freeze_min_age to use for autovacuum (varies by database) */
136 static int default_freeze_min_age;
138 /* Memory context for long-lived data */
139 static MemoryContext AutovacMemCxt;
141 /* struct to keep track of databases in launcher */
142 typedef struct avl_dbase
144 Oid adl_datid; /* hash key -- must be first */
145 TimestampTz adl_next_worker;
146 int adl_score;
147 } avl_dbase;
149 /* struct to keep track of databases in worker */
150 typedef struct avw_dbase
152 Oid adw_datid;
153 char *adw_name;
154 TransactionId adw_frozenxid;
155 PgStat_StatDBEntry *adw_entry;
156 } avw_dbase;
158 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
159 typedef struct av_relation
161 Oid ar_relid;
162 Oid ar_toastrelid;
163 } av_relation;
165 /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
166 typedef struct autovac_table
168 Oid at_relid;
169 Oid at_toastrelid;
170 bool at_dovacuum;
171 bool at_doanalyze;
172 int at_freeze_min_age;
173 int at_vacuum_cost_delay;
174 int at_vacuum_cost_limit;
175 } autovac_table;
177 /*-------------
178 * This struct holds information about a single worker's whereabouts. We keep
179 * an array of these in shared memory, sized according to
180 * autovacuum_max_workers.
182 * wi_links entry into free list or running list
183 * wi_dboid OID of the database this worker is supposed to work on
184 * wi_tableoid OID of the table currently being vacuumed
185 * wi_workerpid PID of the running worker, 0 if not yet started
186 * wi_launchtime Time at which this worker was launched
187 * wi_cost_* Vacuum cost-based delay parameters current in this worker
189 * All fields are protected by AutovacuumLock, except for wi_tableoid which is
190 * protected by AutovacuumScheduleLock (which is read-only for everyone except
191 * that worker itself).
192 *-------------
194 typedef struct WorkerInfoData
196 SHM_QUEUE wi_links;
197 Oid wi_dboid;
198 Oid wi_tableoid;
199 int wi_workerpid;
200 TimestampTz wi_launchtime;
201 int wi_cost_delay;
202 int wi_cost_limit;
203 int wi_cost_limit_base;
204 } WorkerInfoData;
206 typedef struct WorkerInfoData *WorkerInfo;
/*
 * Possible signals received by the launcher from remote processes.  These are
 * stored atomically in shared memory so that other processes can set them
 * without locking.
 *
 * AutoVacNumSignals is not a signal: it is the number of real signals and is
 * used to size the av_signal[] array in the autovacuum shared memory struct.
 * It must therefore stay last and must NOT be given an explicit value;
 * aliasing it to the last real signal would make the array one element too
 * short and turn av_signal[AutoVacRebalance] into an out-of-bounds access.
 */
typedef enum
{
	AutoVacForkFailed,			/* failed trying to start a worker */
	AutoVacRebalance,			/* rebalance the cost limits */
	AutoVacNumSignals			/* must be last */
} AutoVacuumSignal;
220 /*-------------
221 * The main autovacuum shmem struct. On shared memory we store this main
222 * struct and the array of WorkerInfo structs. This struct keeps:
224 * av_signal set by other processes to indicate various conditions
225 * av_launcherpid the PID of the autovacuum launcher
226 * av_freeWorkers the WorkerInfo freelist
227 * av_runningWorkers the WorkerInfo non-free queue
228 * av_startingWorker pointer to WorkerInfo currently being started (cleared by
229 * the worker itself as soon as it's up and running)
231 * This struct is protected by AutovacuumLock, except for av_signal and parts
232 * of the worker list (see above).
233 *-------------
235 typedef struct
237 sig_atomic_t av_signal[AutoVacNumSignals];
238 pid_t av_launcherpid;
239 SHMEM_OFFSET av_freeWorkers;
240 SHM_QUEUE av_runningWorkers;
241 SHMEM_OFFSET av_startingWorker;
242 } AutoVacuumShmemStruct;
244 static AutoVacuumShmemStruct *AutoVacuumShmem;
246 /* the database list in the launcher, and the context that contains it */
247 static Dllist *DatabaseList = NULL;
248 static MemoryContext DatabaseListCxt = NULL;
250 /* Pointer to my own WorkerInfo, valid on each worker */
251 static WorkerInfo MyWorkerInfo = NULL;
253 /* PID of launcher, valid only in worker while shutting down */
254 int AutovacuumLauncherPid = 0;
256 #ifdef EXEC_BACKEND
257 static pid_t avlauncher_forkexec(void);
258 static pid_t avworker_forkexec(void);
259 #endif
260 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]);
261 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]);
263 static Oid do_start_worker(void);
264 static void launcher_determine_sleep(bool canlaunch, bool recursing,
265 struct timeval *nap);
266 static void launch_worker(TimestampTz now);
267 static List *get_database_list(void);
268 static void rebuild_database_list(Oid newdb);
269 static int db_comparator(const void *a, const void *b);
270 static void autovac_balance_cost(void);
272 static void do_autovacuum(void);
273 static void FreeWorkerInfo(int code, Datum arg);
275 static void relation_check_autovac(Oid relid, Form_pg_class classForm,
276 Form_pg_autovacuum avForm, PgStat_StatTabEntry *tabentry,
277 List **table_oids, List **table_toast_list,
278 List **toast_oids);
279 static autovac_table *table_recheck_autovac(Oid relid);
280 static void relation_needs_vacanalyze(Oid relid, Form_pg_autovacuum avForm,
281 Form_pg_class classForm,
282 PgStat_StatTabEntry *tabentry, bool *dovacuum,
283 bool *doanalyze);
285 static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum,
286 bool doanalyze, int freeze_min_age,
287 BufferAccessStrategy bstrategy);
288 static HeapTuple get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid);
289 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
290 PgStat_StatDBEntry *shared,
291 PgStat_StatDBEntry *dbentry);
292 static void autovac_report_activity(VacuumStmt *vacstmt, Oid relid);
293 static void avl_sighup_handler(SIGNAL_ARGS);
294 static void avl_sigusr1_handler(SIGNAL_ARGS);
295 static void avl_sigterm_handler(SIGNAL_ARGS);
296 static void avl_quickdie(SIGNAL_ARGS);
297 static void autovac_refresh_stats(void);
301 /********************************************************************
302 * AUTOVACUUM LAUNCHER CODE
303 ********************************************************************/
305 #ifdef EXEC_BACKEND
/*
 * avlauncher_forkexec
 *
 * EXEC_BACKEND-only way of starting the autovacuum launcher: assemble the
 * argument vector and hand it to postmaster_forkexec, whose result is
 * returned unchanged.
 */
static pid_t
avlauncher_forkexec(void)
{
	char	   *args[10];
	int			nargs = 0;

	args[nargs++] = "postgres";
	args[nargs++] = "--forkavlauncher";
	/* this slot is filled in by postmaster_forkexec */
	args[nargs++] = NULL;
	/* terminator; deliberately not counted in nargs */
	args[nargs] = NULL;

	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
328 * We need this set from the outside, before InitProcess is called
330 void
331 AutovacuumLauncherIAm(void)
333 am_autovacuum_launcher = true;
335 #endif
338 * Main entry point for autovacuum launcher process, to be called from the
339 * postmaster.
342 StartAutoVacLauncher(void)
344 pid_t AutoVacPID;
346 #ifdef EXEC_BACKEND
347 switch ((AutoVacPID = avlauncher_forkexec()))
348 #else
349 switch ((AutoVacPID = fork_process()))
350 #endif
352 case -1:
353 ereport(LOG,
354 (errmsg("could not fork autovacuum process: %m")));
355 return 0;
357 #ifndef EXEC_BACKEND
358 case 0:
359 /* in postmaster child ... */
360 /* Close the postmaster's sockets */
361 ClosePostmasterPorts(false);
363 /* Lose the postmaster's on-exit routines */
364 on_exit_reset();
366 AutoVacLauncherMain(0, NULL);
367 break;
368 #endif
369 default:
370 return (int) AutoVacPID;
373 /* shouldn't get here */
374 return 0;
378 * Main loop for the autovacuum launcher process.
380 NON_EXEC_STATIC void
381 AutoVacLauncherMain(int argc, char *argv[])
383 sigjmp_buf local_sigjmp_buf;
385 /* we are a postmaster subprocess now */
386 IsUnderPostmaster = true;
387 am_autovacuum_launcher = true;
389 /* reset MyProcPid */
390 MyProcPid = getpid();
392 /* record Start Time for logging */
393 MyStartTime = time(NULL);
395 /* Identify myself via ps */
396 init_ps_display("autovacuum launcher process", "", "", "");
398 SetProcessingMode(InitProcessing);
401 * If possible, make this process a group leader, so that the postmaster
402 * can signal any child processes too. (autovacuum probably never has
403 * any child processes, but for consistency we make all postmaster
404 * child processes do this.)
406 #ifdef HAVE_SETSID
407 if (setsid() < 0)
408 elog(FATAL, "setsid() failed: %m");
409 #endif
412 * Set up signal handlers. Since this is an auxiliary process, it has
413 * particular signal requirements -- no deadlock checker or sinval
414 * catchup, for example.
416 pqsignal(SIGHUP, avl_sighup_handler);
418 pqsignal(SIGINT, SIG_IGN);
419 pqsignal(SIGTERM, avl_sigterm_handler);
420 pqsignal(SIGQUIT, avl_quickdie);
421 pqsignal(SIGALRM, SIG_IGN);
423 pqsignal(SIGPIPE, SIG_IGN);
424 pqsignal(SIGUSR1, avl_sigusr1_handler);
425 /* We don't listen for async notifies */
426 pqsignal(SIGUSR2, SIG_IGN);
427 pqsignal(SIGFPE, FloatExceptionHandler);
428 pqsignal(SIGCHLD, SIG_DFL);
430 /* Early initialization */
431 BaseInit();
434 * Create a per-backend PGPROC struct in shared memory, except in the
435 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
436 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
437 * had to do some stuff with LWLocks).
439 #ifndef EXEC_BACKEND
440 InitAuxiliaryProcess();
441 #endif
444 * Create a memory context that we will do all our work in. We do this so
445 * that we can reset the context during error recovery and thereby avoid
446 * possible memory leaks.
448 AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
449 "Autovacuum Launcher",
450 ALLOCSET_DEFAULT_MINSIZE,
451 ALLOCSET_DEFAULT_INITSIZE,
452 ALLOCSET_DEFAULT_MAXSIZE);
453 MemoryContextSwitchTo(AutovacMemCxt);
457 * If an exception is encountered, processing resumes here.
459 * This code is heavily based on bgwriter.c, q.v.
461 if (sigsetjmp(local_sigjmp_buf, 1) != 0)
463 /* since not using PG_TRY, must reset error stack by hand */
464 error_context_stack = NULL;
466 /* Prevents interrupts while cleaning up */
467 HOLD_INTERRUPTS();
469 /* Report the error to the server log */
470 EmitErrorReport();
473 * These operations are really just a minimal subset of
474 * AbortTransaction(). We don't have very many resources to worry
475 * about, but we do have LWLocks.
477 LWLockReleaseAll();
478 AtEOXact_Files();
479 AtEOXact_HashTables(false);
482 * Now return to normal top-level context and clear ErrorContext for
483 * next time.
485 MemoryContextSwitchTo(AutovacMemCxt);
486 FlushErrorState();
488 /* Flush any leaked data in the top-level context */
489 MemoryContextResetAndDeleteChildren(AutovacMemCxt);
491 /* don't leave dangling pointers to freed memory */
492 DatabaseListCxt = NULL;
493 DatabaseList = NULL;
496 * Make sure pgstat also considers our stat data as gone. Note: we
497 * mustn't use autovac_refresh_stats here.
499 pgstat_clear_snapshot();
501 /* Now we can allow interrupts again */
502 RESUME_INTERRUPTS();
505 * Sleep at least 1 second after any error. We don't want to be
506 * filling the error logs as fast as we can.
508 pg_usleep(1000000L);
511 /* We can now handle ereport(ERROR) */
512 PG_exception_stack = &local_sigjmp_buf;
514 ereport(LOG,
515 (errmsg("autovacuum launcher started")));
517 /* must unblock signals before calling rebuild_database_list */
518 PG_SETMASK(&UnBlockSig);
520 /* in emergency mode, just start a worker and go away */
521 if (!AutoVacuumingActive())
523 do_start_worker();
524 proc_exit(0); /* done */
527 AutoVacuumShmem->av_launcherpid = MyProcPid;
530 * Create the initial database list. The invariant we want this list to
531 * keep is that it's ordered by decreasing next_time. As soon as an entry
532 * is updated to a higher time, it will be moved to the front (which is
533 * correct because the only operation is to add autovacuum_naptime to the
534 * entry, and time always increases).
536 rebuild_database_list(InvalidOid);
538 for (;;)
540 struct timeval nap;
541 TimestampTz current_time = 0;
542 bool can_launch;
543 Dlelem *elem;
546 * Emergency bailout if postmaster has died. This is to avoid the
547 * necessity for manual cleanup of all postmaster children.
549 if (!PostmasterIsAlive(true))
550 exit(1);
552 launcher_determine_sleep(AutoVacuumShmem->av_freeWorkers !=
553 INVALID_OFFSET, false, &nap);
556 * Sleep for a while according to schedule.
558 * On some platforms, signals won't interrupt the sleep. To ensure we
559 * respond reasonably promptly when someone signals us, break down the
560 * sleep into 1-second increments, and check for interrupts after each
561 * nap.
563 while (nap.tv_sec > 0 || nap.tv_usec > 0)
565 uint32 sleeptime;
567 if (nap.tv_sec > 0)
569 sleeptime = 1000000;
570 nap.tv_sec--;
572 else
574 sleeptime = nap.tv_usec;
575 nap.tv_usec = 0;
577 pg_usleep(sleeptime);
580 * Emergency bailout if postmaster has died. This is to avoid the
581 * necessity for manual cleanup of all postmaster children.
583 if (!PostmasterIsAlive(true))
584 exit(1);
586 if (got_SIGTERM || got_SIGHUP || got_SIGUSR1)
587 break;
590 /* the normal shutdown case */
591 if (got_SIGTERM)
592 break;
594 if (got_SIGHUP)
596 got_SIGHUP = false;
597 ProcessConfigFile(PGC_SIGHUP);
599 /* shutdown requested in config file */
600 if (!AutoVacuumingActive())
601 break;
603 /* rebalance in case the default cost parameters changed */
604 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
605 autovac_balance_cost();
606 LWLockRelease(AutovacuumLock);
608 /* rebuild the list in case the naptime changed */
609 rebuild_database_list(InvalidOid);
613 * a worker finished, or postmaster signalled failure to start a
614 * worker
616 if (got_SIGUSR1)
618 got_SIGUSR1 = false;
620 /* rebalance cost limits, if needed */
621 if (AutoVacuumShmem->av_signal[AutoVacRebalance])
623 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
624 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
625 autovac_balance_cost();
626 LWLockRelease(AutovacuumLock);
629 if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
632 * If the postmaster failed to start a new worker, we sleep
633 * for a little while and resend the signal. The new worker's
634 * state is still in memory, so this is sufficient. After
635 * that, we restart the main loop.
637 * XXX should we put a limit to the number of times we retry?
638 * I don't think it makes much sense, because a future start
639 * of a worker will continue to fail in the same way.
641 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
642 pg_usleep(100000L); /* 100ms */
643 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
644 continue;
649 * There are some conditions that we need to check before trying to
650 * start a launcher. First, we need to make sure that there is a
651 * launcher slot available. Second, we need to make sure that no other
652 * worker failed while starting up.
655 current_time = GetCurrentTimestamp();
656 LWLockAcquire(AutovacuumLock, LW_SHARED);
658 can_launch = (AutoVacuumShmem->av_freeWorkers != INVALID_OFFSET);
660 if (AutoVacuumShmem->av_startingWorker != INVALID_OFFSET)
662 int waittime;
664 WorkerInfo worker = (WorkerInfo) MAKE_PTR(AutoVacuumShmem->av_startingWorker);
667 * We can't launch another worker when another one is still
668 * starting up (or failed while doing so), so just sleep for a bit
669 * more; that worker will wake us up again as soon as it's ready.
670 * We will only wait autovacuum_naptime seconds (up to a maximum of
671 * 60 seconds) for this to happen however. Note that failure to
672 * connect to a particular database is not a problem here, because
673 * the worker removes itself from the startingWorker pointer before
674 * trying to connect. Problems detected by the postmaster (like
675 * fork() failure) are also reported and handled differently. The
676 * only problems that may cause this code to fire are errors in the
677 * earlier sections of AutoVacWorkerMain, before the worker removes
678 * the WorkerInfo from the startingWorker pointer.
680 waittime = Min(autovacuum_naptime, 60) * 1000;
681 if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
682 waittime))
684 LWLockRelease(AutovacuumLock);
685 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
687 * No other process can put a worker in starting mode, so if
688 * startingWorker is still INVALID after exchanging our lock,
689 * we assume it's the same one we saw above (so we don't
690 * recheck the launch time).
692 if (AutoVacuumShmem->av_startingWorker != INVALID_OFFSET)
694 worker = (WorkerInfo) MAKE_PTR(AutoVacuumShmem->av_startingWorker);
695 worker->wi_dboid = InvalidOid;
696 worker->wi_tableoid = InvalidOid;
697 worker->wi_workerpid = 0;
698 worker->wi_launchtime = 0;
699 worker->wi_links.next = AutoVacuumShmem->av_freeWorkers;
700 AutoVacuumShmem->av_freeWorkers = MAKE_OFFSET(worker);
701 AutoVacuumShmem->av_startingWorker = INVALID_OFFSET;
702 elog(WARNING, "worker took too long to start; cancelled");
705 else
706 can_launch = false;
708 LWLockRelease(AutovacuumLock); /* either shared or exclusive */
710 /* if we can't do anything, just go back to sleep */
711 if (!can_launch)
712 continue;
714 /* We're OK to start a new worker */
716 elem = DLGetTail(DatabaseList);
717 if (elem != NULL)
719 avl_dbase *avdb = DLE_VAL(elem);
722 * launch a worker if next_worker is right now or it is in the past
724 if (TimestampDifferenceExceeds(avdb->adl_next_worker,
725 current_time, 0))
726 launch_worker(current_time);
728 else
731 * Special case when the list is empty: start a worker right away.
732 * This covers the initial case, when no database is in pgstats
733 * (thus the list is empty). Note that the constraints in
734 * launcher_determine_sleep keep us from starting workers too
735 * quickly (at most once every autovacuum_naptime when the list is
736 * empty).
738 launch_worker(current_time);
742 /* Normal exit from the autovac launcher is here */
743 ereport(LOG,
744 (errmsg("autovacuum launcher shutting down")));
745 AutoVacuumShmem->av_launcherpid = 0;
747 proc_exit(0); /* done */
751 * Determine the time to sleep, based on the database list.
753 * The "canlaunch" parameter indicates whether we can start a worker right now,
754 * for example due to the workers being all busy. If this is false, we will
755 * cause a long sleep, which will be interrupted when a worker exits.
757 static void
758 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap)
760 Dlelem *elem;
763 * We sleep until the next scheduled vacuum. We trust that when the
764 * database list was built, care was taken so that no entries have times in
765 * the past; if the first entry has too close a next_worker value, or a
766 * time in the past, we will sleep a small nominal time.
768 if (!canlaunch)
770 nap->tv_sec = autovacuum_naptime;
771 nap->tv_usec = 0;
773 else if ((elem = DLGetTail(DatabaseList)) != NULL)
775 avl_dbase *avdb = DLE_VAL(elem);
776 TimestampTz current_time = GetCurrentTimestamp();
777 TimestampTz next_wakeup;
778 long secs;
779 int usecs;
781 next_wakeup = avdb->adl_next_worker;
782 TimestampDifference(current_time, next_wakeup, &secs, &usecs);
784 nap->tv_sec = secs;
785 nap->tv_usec = usecs;
787 else
789 /* list is empty, sleep for whole autovacuum_naptime seconds */
790 nap->tv_sec = autovacuum_naptime;
791 nap->tv_usec = 0;
795 * If the result is exactly zero, it means a database had an entry with
796 * time in the past. Rebuild the list so that the databases are evenly
797 * distributed again, and recalculate the time to sleep. This can happen
798 * if there are more tables needing vacuum than workers, and they all take
799 * longer to vacuum than autovacuum_naptime.
801 * We only recurse once. rebuild_database_list should always return times
802 * in the future, but it seems best not to trust too much on that.
804 if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
806 rebuild_database_list(InvalidOid);
807 launcher_determine_sleep(canlaunch, true, nap);
808 return;
811 /* 100ms is the smallest time we'll allow the launcher to sleep */
812 if (nap->tv_sec <= 0 && nap->tv_usec <= 100000)
814 nap->tv_sec = 0;
815 nap->tv_usec = 100000; /* 100 ms */
820 * Build an updated DatabaseList. It must only contain databases that appear
821 * in pgstats, and must be sorted by next_worker from highest to lowest,
822 * distributed regularly across the next autovacuum_naptime interval.
824 * Receives the Oid of the database that made this list be generated (we call
825 * this the "new" database, because when the database was already present on
826 * the list, we expect that this function is not called at all). The
827 * preexisting list, if any, will be used to preserve the order of the
828 * databases in the autovacuum_naptime period. The new database is put at the
829 * end of the interval. The actual values are not saved, which should not be
830 * much of a problem.
832 static void
833 rebuild_database_list(Oid newdb)
835 List *dblist;
836 ListCell *cell;
837 MemoryContext newcxt;
838 MemoryContext oldcxt;
839 MemoryContext tmpcxt;
840 HASHCTL hctl;
841 int score;
842 int nelems;
843 HTAB *dbhash;
845 /* use fresh stats */
846 autovac_refresh_stats();
848 newcxt = AllocSetContextCreate(AutovacMemCxt,
849 "AV dblist",
850 ALLOCSET_DEFAULT_MINSIZE,
851 ALLOCSET_DEFAULT_INITSIZE,
852 ALLOCSET_DEFAULT_MAXSIZE);
853 tmpcxt = AllocSetContextCreate(newcxt,
854 "tmp AV dblist",
855 ALLOCSET_DEFAULT_MINSIZE,
856 ALLOCSET_DEFAULT_INITSIZE,
857 ALLOCSET_DEFAULT_MAXSIZE);
858 oldcxt = MemoryContextSwitchTo(tmpcxt);
861 * Implementing this is not as simple as it sounds, because we need to put
862 * the new database at the end of the list; next the databases that were
863 * already on the list, and finally (at the tail of the list) all the other
864 * databases that are not on the existing list.
866 * To do this, we build an empty hash table of scored databases. We will
867 * start with the lowest score (zero) for the new database, then increasing
868 * scores for the databases in the existing list, in order, and lastly
869 * increasing scores for all databases gotten via get_database_list() that
870 * are not already on the hash.
872 * Then we will put all the hash elements into an array, sort the array by
873 * score, and finally put the array elements into the new doubly linked
874 * list.
876 hctl.keysize = sizeof(Oid);
877 hctl.entrysize = sizeof(avl_dbase);
878 hctl.hash = oid_hash;
879 hctl.hcxt = tmpcxt;
880 dbhash = hash_create("db hash", 20, &hctl, /* magic number here FIXME */
881 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
883 /* start by inserting the new database */
884 score = 0;
885 if (OidIsValid(newdb))
887 avl_dbase *db;
888 PgStat_StatDBEntry *entry;
890 /* only consider this database if it has a pgstat entry */
891 entry = pgstat_fetch_stat_dbentry(newdb);
892 if (entry != NULL)
894 /* we assume it isn't found because the hash was just created */
895 db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
897 /* hash_search already filled in the key */
898 db->adl_score = score++;
899 /* next_worker is filled in later */
903 /* Now insert the databases from the existing list */
904 if (DatabaseList != NULL)
906 Dlelem *elem;
908 elem = DLGetHead(DatabaseList);
909 while (elem != NULL)
911 avl_dbase *avdb = DLE_VAL(elem);
912 avl_dbase *db;
913 bool found;
914 PgStat_StatDBEntry *entry;
916 elem = DLGetSucc(elem);
919 * skip databases with no stat entries -- in particular, this
920 * gets rid of dropped databases
922 entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
923 if (entry == NULL)
924 continue;
926 db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
928 if (!found)
930 /* hash_search already filled in the key */
931 db->adl_score = score++;
932 /* next_worker is filled in later */
937 /* finally, insert all qualifying databases not previously inserted */
938 dblist = get_database_list();
939 foreach(cell, dblist)
941 avw_dbase *avdb = lfirst(cell);
942 avl_dbase *db;
943 bool found;
944 PgStat_StatDBEntry *entry;
946 /* only consider databases with a pgstat entry */
947 entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
948 if (entry == NULL)
949 continue;
951 db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
952 /* only update the score if the database was not already on the hash */
953 if (!found)
955 /* hash_search already filled in the key */
956 db->adl_score = score++;
957 /* next_worker is filled in later */
960 nelems = score;
962 /* from here on, the allocated memory belongs to the new list */
963 MemoryContextSwitchTo(newcxt);
964 DatabaseList = DLNewList();
966 if (nelems > 0)
968 TimestampTz current_time;
969 int millis_increment;
970 avl_dbase *dbary;
971 avl_dbase *db;
972 HASH_SEQ_STATUS seq;
973 int i;
975 /* put all the hash elements into an array */
976 dbary = palloc(nelems * sizeof(avl_dbase));
978 i = 0;
979 hash_seq_init(&seq, dbhash);
980 while ((db = hash_seq_search(&seq)) != NULL)
981 memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
983 /* sort the array */
984 qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
986 /* this is the time interval between databases in the schedule */
987 millis_increment = 1000.0 * autovacuum_naptime / nelems;
988 current_time = GetCurrentTimestamp();
991 * move the elements from the array into the dllist, setting the
992 * next_worker while walking the array
994 for (i = 0; i < nelems; i++)
996 avl_dbase *db = &(dbary[i]);
997 Dlelem *elem;
999 current_time = TimestampTzPlusMilliseconds(current_time,
1000 millis_increment);
1001 db->adl_next_worker = current_time;
1003 elem = DLNewElem(db);
1004 /* later elements should go closer to the head of the list */
1005 DLAddHead(DatabaseList, elem);
1009 /* all done, clean up memory */
1010 if (DatabaseListCxt != NULL)
1011 MemoryContextDelete(DatabaseListCxt);
1012 MemoryContextDelete(tmpcxt);
1013 DatabaseListCxt = newcxt;
1014 MemoryContextSwitchTo(oldcxt);
1017 /* qsort comparator for avl_dbase, using adl_score */
1018 static int
1019 db_comparator(const void *a, const void *b)
1021 if (((avl_dbase *) a)->adl_score == ((avl_dbase *) b)->adl_score)
1022 return 0;
1023 else
1024 return (((avl_dbase *) a)->adl_score < ((avl_dbase *) b)->adl_score) ? 1 : -1;
1028 * do_start_worker
1030 * Bare-bones procedure for starting an autovacuum worker from the launcher.
1031 * It determines what database to work on, sets up shared memory stuff and
1032 * signals postmaster to start the worker. It fails gracefully if invoked when
1033 * autovacuum_workers are already active.
1035 * Return value is the OID of the database that the worker is going to process,
1036 * or InvalidOid if no worker was actually started.
1038 static Oid
1039 do_start_worker(void)
1041 List *dblist;
1042 ListCell *cell;
1043 TransactionId xidForceLimit;
1044 bool for_xid_wrap;
1045 avw_dbase *avdb;
1046 TimestampTz current_time;
1047 bool skipit = false;
1048 Oid retval = InvalidOid;
1049 MemoryContext tmpcxt,
1050 oldcxt;
1052 /* return quickly when there are no free workers */
1053 LWLockAcquire(AutovacuumLock, LW_SHARED);
1054 if (AutoVacuumShmem->av_freeWorkers == INVALID_OFFSET)
1056 LWLockRelease(AutovacuumLock);
1057 return InvalidOid;
1059 LWLockRelease(AutovacuumLock);
1062 * Create and switch to a temporary context to avoid leaking the memory
1063 * allocated for the database list.
1065 tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1066 "Start worker tmp cxt",
1067 ALLOCSET_DEFAULT_MINSIZE,
1068 ALLOCSET_DEFAULT_INITSIZE,
1069 ALLOCSET_DEFAULT_MAXSIZE);
1070 oldcxt = MemoryContextSwitchTo(tmpcxt);
1072 /* use fresh stats */
1073 autovac_refresh_stats();
1075 /* Get a list of databases */
1076 dblist = get_database_list();
1079 * Determine the oldest datfrozenxid/relfrozenxid that we will allow
1080 * to pass without forcing a vacuum. (This limit can be tightened for
1081 * particular tables, but not loosened.)
1083 recentXid = ReadNewTransactionId();
1084 xidForceLimit = recentXid - autovacuum_freeze_max_age;
1085 /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1086 if (xidForceLimit < FirstNormalTransactionId)
1087 xidForceLimit -= FirstNormalTransactionId;
1090 * Choose a database to connect to. We pick the database that was least
1091 * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1092 * wraparound-related data loss. If any db at risk of wraparound is
1093 * found, we pick the one with oldest datfrozenxid, independently of
1094 * autovacuum times.
1096 * Note that a database with no stats entry is not considered, except for
1097 * Xid wraparound purposes. The theory is that if no one has ever
1098 * connected to it since the stats were last initialized, it doesn't need
1099 * vacuuming.
1101 * XXX This could be improved if we had more info about whether it needs
1102 * vacuuming before connecting to it. Perhaps look through the pgstats
1103 * data for the database's tables? One idea is to keep track of the
1104 * number of new and dead tuples per database in pgstats. However it
1105 * isn't clear how to construct a metric that measures that and not cause
1106 * starvation for less busy databases.
1108 avdb = NULL;
1109 for_xid_wrap = false;
1110 current_time = GetCurrentTimestamp();
1111 foreach(cell, dblist)
1113 avw_dbase *tmp = lfirst(cell);
1114 Dlelem *elem;
1116 /* Check to see if this one is at risk of wraparound */
1117 if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1119 if (avdb == NULL ||
1120 TransactionIdPrecedes(tmp->adw_frozenxid, avdb->adw_frozenxid))
1121 avdb = tmp;
1122 for_xid_wrap = true;
1123 continue;
1125 else if (for_xid_wrap)
1126 continue; /* ignore not-at-risk DBs */
1128 /* Find pgstat entry if any */
1129 tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1132 * Skip a database with no pgstat entry; it means it hasn't seen any
1133 * activity.
1135 if (!tmp->adw_entry)
1136 continue;
1139 * Also, skip a database that appears on the database list as having
1140 * been processed recently (less than autovacuum_naptime seconds ago).
1141 * We do this so that we don't select a database which we just
1142 * selected, but that pgstat hasn't gotten around to updating the last
1143 * autovacuum time yet.
1145 skipit = false;
1146 elem = DatabaseList ? DLGetTail(DatabaseList) : NULL;
1148 while (elem != NULL)
1150 avl_dbase *dbp = DLE_VAL(elem);
1152 if (dbp->adl_datid == tmp->adw_datid)
1155 * Skip this database if its next_worker value falls between
1156 * the current time and the current time plus naptime.
1158 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1159 current_time, 0) &&
1160 !TimestampDifferenceExceeds(current_time,
1161 dbp->adl_next_worker,
1162 autovacuum_naptime * 1000))
1163 skipit = true;
1165 break;
1167 elem = DLGetPred(elem);
1169 if (skipit)
1170 continue;
1173 * Remember the db with oldest autovac time. (If we are here,
1174 * both tmp->entry and db->entry must be non-null.)
1176 if (avdb == NULL ||
1177 tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1178 avdb = tmp;
1181 /* Found a database -- process it */
1182 if (avdb != NULL)
1184 WorkerInfo worker;
1185 SHMEM_OFFSET sworker;
1187 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1190 * Get a worker entry from the freelist. We checked above, so there
1191 * really should be a free slot -- complain very loudly if there isn't.
1193 sworker = AutoVacuumShmem->av_freeWorkers;
1194 if (sworker == INVALID_OFFSET)
1195 elog(FATAL, "no free worker found");
1197 worker = (WorkerInfo) MAKE_PTR(sworker);
1198 AutoVacuumShmem->av_freeWorkers = worker->wi_links.next;
1200 worker->wi_dboid = avdb->adw_datid;
1201 worker->wi_workerpid = 0;
1202 worker->wi_launchtime = GetCurrentTimestamp();
1204 AutoVacuumShmem->av_startingWorker = sworker;
1206 LWLockRelease(AutovacuumLock);
1208 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1210 retval = avdb->adw_datid;
1212 else if (skipit)
1215 * If we skipped all databases on the list, rebuild it, because it
1216 * probably contains a dropped database.
1218 rebuild_database_list(InvalidOid);
1221 MemoryContextSwitchTo(oldcxt);
1222 MemoryContextDelete(tmpcxt);
1224 return retval;
1228 * launch_worker
1230 * Wrapper for starting a worker from the launcher. Besides actually starting
1231 * it, update the database list to reflect the next time that another one will
1232 * need to be started on the selected database. The actual database choice is
1233 * left to do_start_worker.
1235 * This routine is also expected to insert an entry into the database list if
1236 * the selected database was previously absent from the list. It returns the
1237 * new database list.
1239 static void
1240 launch_worker(TimestampTz now)
1242 Oid dbid;
1243 Dlelem *elem;
1245 dbid = do_start_worker();
1246 if (OidIsValid(dbid))
1249 * Walk the database list and update the corresponding entry. If the
1250 * database is not on the list, we'll recreate the list.
1252 elem = (DatabaseList == NULL) ? NULL : DLGetHead(DatabaseList);
1253 while (elem != NULL)
1255 avl_dbase *avdb = DLE_VAL(elem);
1257 if (avdb->adl_datid == dbid)
1260 * add autovacuum_naptime seconds to the current time, and use
1261 * that as the new "next_worker" field for this database.
1263 avdb->adl_next_worker =
1264 TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1266 DLMoveToFront(elem);
1267 break;
1269 elem = DLGetSucc(elem);
1273 * If the database was not present in the database list, we rebuild the
1274 * list. It's possible that the database does not get into the list
1275 * anyway, for example if it's a database that doesn't have a pgstat
1276 * entry, but this is not a problem because we don't want to schedule
1277 * workers regularly into those in any case.
1279 if (elem == NULL)
1280 rebuild_database_list(dbid);
1285 * Called from postmaster to signal a failure to fork a process to become
1286 * worker. The postmaster should kill(SIGUSR1) the launcher shortly
1287 * after calling this function.
1289 void
1290 AutoVacWorkerFailed(void)
1292 AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1295 /* SIGHUP: set flag to re-read config file at next convenient time */
1296 static void
1297 avl_sighup_handler(SIGNAL_ARGS)
1299 got_SIGHUP = true;
1302 /* SIGUSR1: a worker is up and running, or just finished */
1303 static void
1304 avl_sigusr1_handler(SIGNAL_ARGS)
1306 got_SIGUSR1 = true;
1309 /* SIGTERM: time to die */
1310 static void
1311 avl_sigterm_handler(SIGNAL_ARGS)
1313 got_SIGTERM = true;
1317 * avl_quickdie occurs when signalled SIGQUIT from postmaster.
1319 * Some backend has bought the farm, so we need to stop what we're doing
1320 * and exit.
1322 static void
1323 avl_quickdie(SIGNAL_ARGS)
1325 PG_SETMASK(&BlockSig);
1328 * DO NOT proc_exit() -- we're here because shared memory may be
1329 * corrupted, so we don't want to try to clean up our transaction. Just
1330 * nail the windows shut and get out of town.
1332 * Note we do exit(2) not exit(0). This is to force the postmaster into a
1333 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
1334 * backend. This is necessary precisely because we don't clean up our
1335 * shared memory state.
1337 exit(2);
1341 /********************************************************************
1342 * AUTOVACUUM WORKER CODE
1343 ********************************************************************/
1345 #ifdef EXEC_BACKEND
/*
 * avworker_forkexec
 *		fork/exec support for the autovacuum worker (EXEC_BACKEND builds).
 *
 * Builds the argument vector and hands it to postmaster_forkexec, whose
 * result is returned unchanged.
 */
static pid_t
avworker_forkexec(void)
{
	char	   *args[10];
	int			nargs = 0;

	args[nargs++] = "postgres";
	args[nargs++] = "--forkavworker";
	args[nargs++] = NULL;		/* filled in by postmaster_forkexec */
	args[nargs] = NULL;			/* terminator, not counted in nargs */

	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
1368 * We need this set from the outside, before InitProcess is called
1370 void
1371 AutovacuumWorkerIAm(void)
1373 am_autovacuum_worker = true;
1375 #endif
1378 * Main entry point for autovacuum worker process.
1380 * This code is heavily based on pgarch.c, q.v.
1383 StartAutoVacWorker(void)
1385 pid_t worker_pid;
1387 #ifdef EXEC_BACKEND
1388 switch ((worker_pid = avworker_forkexec()))
1389 #else
1390 switch ((worker_pid = fork_process()))
1391 #endif
1393 case -1:
1394 ereport(LOG,
1395 (errmsg("could not fork autovacuum process: %m")));
1396 return 0;
1398 #ifndef EXEC_BACKEND
1399 case 0:
1400 /* in postmaster child ... */
1401 /* Close the postmaster's sockets */
1402 ClosePostmasterPorts(false);
1404 /* Lose the postmaster's on-exit routines */
1405 on_exit_reset();
1407 AutoVacWorkerMain(0, NULL);
1408 break;
1409 #endif
1410 default:
1411 return (int) worker_pid;
1414 /* shouldn't get here */
1415 return 0;
1419 * AutoVacWorkerMain
1421 NON_EXEC_STATIC void
1422 AutoVacWorkerMain(int argc, char *argv[])
1424 sigjmp_buf local_sigjmp_buf;
1425 Oid dbid;
1427 /* we are a postmaster subprocess now */
1428 IsUnderPostmaster = true;
1429 am_autovacuum_worker = true;
1431 /* reset MyProcPid */
1432 MyProcPid = getpid();
1434 /* record Start Time for logging */
1435 MyStartTime = time(NULL);
1437 /* Identify myself via ps */
1438 init_ps_display("autovacuum worker process", "", "", "");
1440 SetProcessingMode(InitProcessing);
1443 * If possible, make this process a group leader, so that the postmaster
1444 * can signal any child processes too. (autovacuum probably never has
1445 * any child processes, but for consistency we make all postmaster
1446 * child processes do this.)
1448 #ifdef HAVE_SETSID
1449 if (setsid() < 0)
1450 elog(FATAL, "setsid() failed: %m");
1451 #endif
1454 * Set up signal handlers. We operate on databases much like a regular
1455 * backend, so we use the same signal handling. See equivalent code in
1456 * tcop/postgres.c.
1458 * Currently, we don't pay attention to postgresql.conf changes that
1459 * happen during a single daemon iteration, so we can ignore SIGHUP.
1461 pqsignal(SIGHUP, SIG_IGN);
1464 * SIGINT is used to signal cancelling the current table's vacuum;
1465 * SIGTERM means abort and exit cleanly, and SIGQUIT means abandon ship.
1467 pqsignal(SIGINT, StatementCancelHandler);
1468 pqsignal(SIGTERM, die);
1469 pqsignal(SIGQUIT, quickdie);
1470 pqsignal(SIGALRM, handle_sig_alarm);
1472 pqsignal(SIGPIPE, SIG_IGN);
1473 pqsignal(SIGUSR1, CatchupInterruptHandler);
1474 /* We don't listen for async notifies */
1475 pqsignal(SIGUSR2, SIG_IGN);
1476 pqsignal(SIGFPE, FloatExceptionHandler);
1477 pqsignal(SIGCHLD, SIG_DFL);
1479 /* Early initialization */
1480 BaseInit();
1483 * Create a per-backend PGPROC struct in shared memory, except in the
1484 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1485 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1486 * had to do some stuff with LWLocks).
1488 #ifndef EXEC_BACKEND
1489 InitProcess();
1490 #endif
1493 * If an exception is encountered, processing resumes here.
1495 * See notes in postgres.c about the design of this coding.
1497 if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1499 /* Prevents interrupts while cleaning up */
1500 HOLD_INTERRUPTS();
1502 /* Report the error to the server log */
1503 EmitErrorReport();
1506 * We can now go away. Note that because we called InitProcess, a
1507 * callback was registered to do ProcKill, which will clean up
1508 * necessary state.
1510 proc_exit(0);
1513 /* We can now handle ereport(ERROR) */
1514 PG_exception_stack = &local_sigjmp_buf;
1516 PG_SETMASK(&UnBlockSig);
1519 * Force zero_damaged_pages OFF in the autovac process, even if it is set
1520 * in postgresql.conf. We don't really want such a dangerous option being
1521 * applied non-interactively.
1523 SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1526 * Force statement_timeout to zero to avoid a timeout setting from
1527 * preventing regular maintenance from being executed.
1529 SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1532 * Get the info about the database we're going to work on.
1534 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1537 * beware of startingWorker being INVALID; this should normally not happen,
1538 * but if a worker fails after forking and before this, the launcher might
1539 * have decided to remove it from the queue and start again.
1541 if (AutoVacuumShmem->av_startingWorker != INVALID_OFFSET)
1543 MyWorkerInfo = (WorkerInfo) MAKE_PTR(AutoVacuumShmem->av_startingWorker);
1544 dbid = MyWorkerInfo->wi_dboid;
1545 MyWorkerInfo->wi_workerpid = MyProcPid;
1547 /* insert into the running list */
1548 SHMQueueInsertBefore(&AutoVacuumShmem->av_runningWorkers,
1549 &MyWorkerInfo->wi_links);
1552 * remove from the "starting" pointer, so that the launcher can start
1553 * a new worker if required
1555 AutoVacuumShmem->av_startingWorker = INVALID_OFFSET;
1556 LWLockRelease(AutovacuumLock);
1558 on_shmem_exit(FreeWorkerInfo, 0);
1560 /* wake up the launcher */
1561 if (AutoVacuumShmem->av_launcherpid != 0)
1562 kill(AutoVacuumShmem->av_launcherpid, SIGUSR1);
1564 else
1566 /* no worker entry for me, go away */
1567 elog(WARNING, "autovacuum worker started without a worker entry");
1568 dbid = InvalidOid;
1569 LWLockRelease(AutovacuumLock);
1572 if (OidIsValid(dbid))
1574 char *dbname;
1577 * Report autovac startup to the stats collector. We deliberately do
1578 * this before InitPostgres, so that the last_autovac_time will get
1579 * updated even if the connection attempt fails. This is to prevent
1580 * autovac from getting "stuck" repeatedly selecting an unopenable
1581 * database, rather than making any progress on stuff it can connect
1582 * to.
1584 pgstat_report_autovac(dbid);
1587 * Connect to the selected database
1589 * Note: if we have selected a just-deleted database (due to using
1590 * stale stats info), we'll fail and exit here.
1592 InitPostgres(NULL, dbid, NULL, &dbname);
1593 SetProcessingMode(NormalProcessing);
1594 set_ps_display(dbname, false);
1595 ereport(DEBUG1,
1596 (errmsg("autovacuum: processing database \"%s\"", dbname)));
1598 /* And do an appropriate amount of work */
1599 recentXid = ReadNewTransactionId();
1600 do_autovacuum();
1604 * The launcher will be notified of my death in ProcKill, *if* we managed
1605 * to get a worker slot at all
1608 /* All done, go away */
1609 proc_exit(0);
1613 * Return a WorkerInfo to the free list
1615 static void
1616 FreeWorkerInfo(int code, Datum arg)
1618 if (MyWorkerInfo != NULL)
1620 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1623 * Wake the launcher up so that he can launch a new worker immediately
1624 * if required. We only save the launcher's PID in local memory here;
1625 * the actual signal will be sent when the PGPROC is recycled. Note
1626 * that we always do this, so that the launcher can rebalance the cost
1627 * limit setting of the remaining workers.
1629 * We somewhat ignore the risk that the launcher changes its PID
1630 * between we reading it and the actual kill; we expect ProcKill to be
1631 * called shortly after us, and we assume that PIDs are not reused too
1632 * quickly after a process exits.
1634 AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1636 SHMQueueDelete(&MyWorkerInfo->wi_links);
1637 MyWorkerInfo->wi_links.next = AutoVacuumShmem->av_freeWorkers;
1638 MyWorkerInfo->wi_dboid = InvalidOid;
1639 MyWorkerInfo->wi_tableoid = InvalidOid;
1640 MyWorkerInfo->wi_workerpid = 0;
1641 MyWorkerInfo->wi_launchtime = 0;
1642 MyWorkerInfo->wi_cost_delay = 0;
1643 MyWorkerInfo->wi_cost_limit = 0;
1644 MyWorkerInfo->wi_cost_limit_base = 0;
1645 AutoVacuumShmem->av_freeWorkers = MAKE_OFFSET(MyWorkerInfo);
1646 /* not mine anymore */
1647 MyWorkerInfo = NULL;
1650 * now that we're inactive, cause a rebalancing of the surviving
1651 * workers
1653 AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1654 LWLockRelease(AutovacuumLock);
1659 * Update the cost-based delay parameters, so that multiple workers consume
1660 * each a fraction of the total available I/O.
1662 void
1663 AutoVacuumUpdateDelay(void)
1665 if (MyWorkerInfo)
1667 VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1668 VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1673 * autovac_balance_cost
1674 * Recalculate the cost limit setting for each active workers.
1676 * Caller must hold the AutovacuumLock in exclusive mode.
1678 static void
1679 autovac_balance_cost(void)
1681 WorkerInfo worker;
1683 * note: in cost_limit, zero also means use value from elsewhere, because
1684 * zero is not a valid value.
1686 int vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1687 autovacuum_vac_cost_limit : VacuumCostLimit);
1688 int vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1689 autovacuum_vac_cost_delay : VacuumCostDelay);
1690 double cost_total;
1691 double cost_avail;
1693 /* not set? nothing to do */
1694 if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1695 return;
1697 /* caculate the total base cost limit of active workers */
1698 cost_total = 0.0;
1699 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
1700 &AutoVacuumShmem->av_runningWorkers,
1701 offsetof(WorkerInfoData, wi_links));
1702 while (worker)
1704 if (worker->wi_workerpid != 0 &&
1705 worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1706 cost_total +=
1707 (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1709 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
1710 &worker->wi_links,
1711 offsetof(WorkerInfoData, wi_links));
1713 /* there are no cost limits -- nothing to do */
1714 if (cost_total <= 0)
1715 return;
1718 * Adjust each cost limit of active workers to balance the total of
1719 * cost limit to autovacuum_vacuum_cost_limit.
1721 cost_avail = (double) vac_cost_limit / vac_cost_delay;
1722 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
1723 &AutoVacuumShmem->av_runningWorkers,
1724 offsetof(WorkerInfoData, wi_links));
1725 while (worker)
1727 if (worker->wi_workerpid != 0 &&
1728 worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1730 int limit = (int)
1731 (cost_avail * worker->wi_cost_limit_base / cost_total);
1734 * We put a lower bound of 1 to the cost_limit, to avoid division-
1735 * by-zero in the vacuum code.
1737 worker->wi_cost_limit = Max(Min(limit, worker->wi_cost_limit_base), 1);
1739 elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_delay=%d)",
1740 worker->wi_workerpid, worker->wi_dboid,
1741 worker->wi_tableoid, worker->wi_cost_limit, worker->wi_cost_delay);
1744 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
1745 &worker->wi_links,
1746 offsetof(WorkerInfoData, wi_links));
1751 * get_database_list
1753 * Return a list of all databases. Note we cannot use pg_database,
1754 * because we aren't connected; we use the flat database file.
1756 static List *
1757 get_database_list(void)
1759 char *filename;
1760 List *dblist = NIL;
1761 char thisname[NAMEDATALEN];
1762 FILE *db_file;
1763 Oid db_id;
1764 Oid db_tablespace;
1765 TransactionId db_frozenxid;
1767 filename = database_getflatfilename();
1768 db_file = AllocateFile(filename, "r");
1769 if (db_file == NULL)
1770 ereport(FATAL,
1771 (errcode_for_file_access(),
1772 errmsg("could not open file \"%s\": %m", filename)));
1774 while (read_pg_database_line(db_file, thisname, &db_id,
1775 &db_tablespace, &db_frozenxid))
1777 avw_dbase *avdb;
1779 avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1781 avdb->adw_datid = db_id;
1782 avdb->adw_name = pstrdup(thisname);
1783 avdb->adw_frozenxid = db_frozenxid;
1784 /* this gets set later: */
1785 avdb->adw_entry = NULL;
1787 dblist = lappend(dblist, avdb);
1790 FreeFile(db_file);
1791 pfree(filename);
1793 return dblist;
1797 * Process a database table-by-table
1799 * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1800 * order not to ignore shutdown commands for too long.
1802 static void
1803 do_autovacuum(void)
1805 Relation classRel,
1806 avRel;
1807 HeapTuple tuple;
1808 HeapScanDesc relScan;
1809 Form_pg_database dbForm;
1810 List *table_oids = NIL;
1811 List *toast_oids = NIL;
1812 List *table_toast_list = NIL;
1813 ListCell * volatile cell;
1814 PgStat_StatDBEntry *shared;
1815 PgStat_StatDBEntry *dbentry;
1816 BufferAccessStrategy bstrategy;
1819 * StartTransactionCommand and CommitTransactionCommand will automatically
1820 * switch to other contexts. We need this one to keep the list of
1821 * relations to vacuum/analyze across transactions.
1823 AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1824 "AV worker",
1825 ALLOCSET_DEFAULT_MINSIZE,
1826 ALLOCSET_DEFAULT_INITSIZE,
1827 ALLOCSET_DEFAULT_MAXSIZE);
1828 MemoryContextSwitchTo(AutovacMemCxt);
1831 * may be NULL if we couldn't find an entry (only happens if we
1832 * are forcing a vacuum for anti-wrap purposes).
1834 dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1836 /* Start a transaction so our commands have one to play into. */
1837 StartTransactionCommand();
1839 /* functions in indexes may want a snapshot set */
1840 ActiveSnapshot = CopySnapshot(GetTransactionSnapshot());
1843 * Clean up any dead statistics collector entries for this DB. We always
1844 * want to do this exactly once per DB-processing cycle, even if we find
1845 * nothing worth vacuuming in the database.
1847 pgstat_vacuum_tabstat();
1850 * Find the pg_database entry and select the default freeze_min_age.
1851 * We use zero in template and nonconnectable databases,
1852 * else the system-wide default.
1854 tuple = SearchSysCache(DATABASEOID,
1855 ObjectIdGetDatum(MyDatabaseId),
1856 0, 0, 0);
1857 if (!HeapTupleIsValid(tuple))
1858 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1859 dbForm = (Form_pg_database) GETSTRUCT(tuple);
1861 if (dbForm->datistemplate || !dbForm->datallowconn)
1862 default_freeze_min_age = 0;
1863 else
1864 default_freeze_min_age = vacuum_freeze_min_age;
1866 ReleaseSysCache(tuple);
1868 /* StartTransactionCommand changed elsewhere */
1869 MemoryContextSwitchTo(AutovacMemCxt);
1871 /* The database hash where pgstat keeps shared relations */
1872 shared = pgstat_fetch_stat_dbentry(InvalidOid);
1874 classRel = heap_open(RelationRelationId, AccessShareLock);
1875 avRel = heap_open(AutovacuumRelationId, AccessShareLock);
1878 * Scan pg_class and determine which tables to vacuum.
1880 * The stats subsystem collects stats for toast tables independently of
1881 * the stats for their parent tables. We need to check those stats since
1882 * in cases with short, wide tables there might be proportionally much
1883 * more activity in the toast table than in its parent.
1885 * Since we can only issue VACUUM against the parent table, we need to
1886 * transpose a decision to vacuum a toast table into a decision to vacuum
1887 * its parent. There's no point in considering ANALYZE on a toast table,
1888 * either. To support this, we keep a list of OIDs of toast tables that
1889 * need vacuuming alongside the list of regular tables. Regular tables
1890 * will be entered into the table list even if they appear not to need
1891 * vacuuming; we go back and re-mark them after finding all the vacuumable
1892 * toast tables.
1894 relScan = heap_beginscan(classRel, SnapshotNow, 0, NULL);
1896 while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
1898 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
1899 Form_pg_autovacuum avForm = NULL;
1900 PgStat_StatTabEntry *tabentry;
1901 HeapTuple avTup;
1902 Oid relid;
1904 /* Consider only regular and toast tables. */
1905 if (classForm->relkind != RELKIND_RELATION &&
1906 classForm->relkind != RELKIND_TOASTVALUE)
1907 continue;
1910 * Skip temp tables (i.e. those in temp namespaces). We cannot safely
1911 * process other backends' temp tables.
1913 if (isAnyTempNamespace(classForm->relnamespace))
1914 continue;
1916 relid = HeapTupleGetOid(tuple);
1918 /* Fetch the pg_autovacuum tuple for the relation, if any */
1919 avTup = get_pg_autovacuum_tuple_relid(avRel, relid);
1920 if (HeapTupleIsValid(avTup))
1921 avForm = (Form_pg_autovacuum) GETSTRUCT(avTup);
1923 /* Fetch the pgstat entry for this table */
1924 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
1925 shared, dbentry);
1927 relation_check_autovac(relid, classForm, avForm, tabentry,
1928 &table_oids, &table_toast_list, &toast_oids);
1930 if (HeapTupleIsValid(avTup))
1931 heap_freetuple(avTup);
1934 heap_endscan(relScan);
1935 heap_close(avRel, AccessShareLock);
1936 heap_close(classRel, AccessShareLock);
1939 * Add to the list of tables to vacuum, the OIDs of the tables that
1940 * correspond to the saved OIDs of toast tables needing vacuum.
1942 foreach(cell, toast_oids)
1944 Oid toastoid = lfirst_oid(cell);
1945 ListCell *cell2;
1947 foreach(cell2, table_toast_list)
1949 av_relation *ar = lfirst(cell2);
1951 if (ar->ar_toastrelid == toastoid)
1953 table_oids = lappend_oid(table_oids, ar->ar_relid);
1954 break;
1959 list_free_deep(table_toast_list);
1960 table_toast_list = NIL;
1961 list_free(toast_oids);
1962 toast_oids = NIL;
1965 * Create a buffer access strategy object for VACUUM to use. We want
1966 * to use the same one across all the vacuum operations we perform,
1967 * since the point is for VACUUM not to blow out the shared cache.
1969 bstrategy = GetAccessStrategy(BAS_VACUUM);
1972 * create a memory context to act as fake PortalContext, so that the
1973 * contexts created in the vacuum code are cleaned up for each table.
1975 PortalContext = AllocSetContextCreate(AutovacMemCxt,
1976 "Autovacuum Portal",
1977 ALLOCSET_DEFAULT_INITSIZE,
1978 ALLOCSET_DEFAULT_MINSIZE,
1979 ALLOCSET_DEFAULT_MAXSIZE);
1982 * Perform operations on collected tables.
1984 foreach(cell, table_oids)
1986 Oid relid = lfirst_oid(cell);
1987 autovac_table *tab;
1988 WorkerInfo worker;
1989 bool skipit;
1991 CHECK_FOR_INTERRUPTS();
1994 * hold schedule lock from here until we're sure that this table
1995 * still needs vacuuming. We also need the AutovacuumLock to walk
1996 * the worker array, but we'll let go of that one quickly.
1998 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
1999 LWLockAcquire(AutovacuumLock, LW_SHARED);
2002 * Check whether the table is being vacuumed concurrently by another
2003 * worker.
2005 skipit = false;
2006 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
2007 &AutoVacuumShmem->av_runningWorkers,
2008 offsetof(WorkerInfoData, wi_links));
2009 while (worker)
2011 /* ignore myself */
2012 if (worker == MyWorkerInfo)
2013 goto next_worker;
2015 /* ignore workers in other databases */
2016 if (worker->wi_dboid != MyDatabaseId)
2017 goto next_worker;
2019 if (worker->wi_tableoid == relid)
2021 skipit = true;
2022 break;
2025 next_worker:
2026 worker = (WorkerInfo) SHMQueueNext(&AutoVacuumShmem->av_runningWorkers,
2027 &worker->wi_links,
2028 offsetof(WorkerInfoData, wi_links));
2030 LWLockRelease(AutovacuumLock);
2031 if (skipit)
2033 LWLockRelease(AutovacuumScheduleLock);
2034 continue;
2038 * Check whether pgstat data still says we need to vacuum this table.
2039 * It could have changed if something else processed the table while we
2040 * weren't looking.
2042 * FIXME we ignore the possibility that the table was finished being
2043 * vacuumed in the last 500ms (PGSTAT_STAT_INTERVAL). This is a bug.
2045 MemoryContextSwitchTo(AutovacMemCxt);
2046 tab = table_recheck_autovac(relid);
2047 if (tab == NULL)
2049 /* someone else vacuumed the table */
2050 LWLockRelease(AutovacuumScheduleLock);
2051 continue;
2055 * Ok, good to go. Store the table in shared memory before releasing
2056 * the lock so that other workers don't vacuum it concurrently.
2058 MyWorkerInfo->wi_tableoid = relid;
2059 LWLockRelease(AutovacuumScheduleLock);
2061 /* Set the initial vacuum cost parameters for this table */
2062 VacuumCostDelay = tab->at_vacuum_cost_delay;
2063 VacuumCostLimit = tab->at_vacuum_cost_limit;
2066 * Advertise my cost delay parameters for the balancing algorithm, and
2067 * do a balance
2069 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2070 MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2071 MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2072 MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2073 autovac_balance_cost();
2074 LWLockRelease(AutovacuumLock);
2076 /* clean up memory before each iteration */
2077 MemoryContextResetAndDeleteChildren(PortalContext);
2080 * We will abort vacuuming the current table if we are interrupted, and
2081 * continue with the next one in schedule; but if anything else
2082 * happens, we will do our usual error handling which is to cause the
2083 * worker process to exit.
2085 PG_TRY();
2087 /* have at it */
2088 MemoryContextSwitchTo(TopTransactionContext);
2089 autovacuum_do_vac_analyze(tab->at_relid,
2090 tab->at_dovacuum,
2091 tab->at_doanalyze,
2092 tab->at_freeze_min_age,
2093 bstrategy);
2095 PG_CATCH();
2097 ErrorData *errdata;
2099 MemoryContextSwitchTo(TopTransactionContext);
2100 errdata = CopyErrorData();
2103 * If we errored out due to a cancel request, abort and restart the
2104 * transaction and go to the next table. Otherwise rethrow the
2105 * error so that the outermost handler deals with it.
2107 if (errdata->sqlerrcode == ERRCODE_QUERY_CANCELED)
2109 HOLD_INTERRUPTS();
2110 elog(LOG, "cancelling autovacuum of table \"%s.%s.%s\"",
2111 get_database_name(MyDatabaseId),
2112 get_namespace_name(get_rel_namespace(tab->at_relid)),
2113 get_rel_name(tab->at_relid));
2115 AbortOutOfAnyTransaction();
2116 FlushErrorState();
2117 MemoryContextResetAndDeleteChildren(PortalContext);
2119 /* restart our transaction for the following operations */
2120 StartTransactionCommand();
2121 RESUME_INTERRUPTS();
2123 else
2124 PG_RE_THROW();
2126 PG_END_TRY();
2128 /* be tidy */
2129 pfree(tab);
2133 * Update pg_database.datfrozenxid, and truncate pg_clog if possible.
2134 * We only need to do this once, not after each table.
2136 vac_update_datfrozenxid();
2138 /* Finally close out the last transaction. */
2139 CommitTransactionCommand();
2143 * Returns a copy of the pg_autovacuum tuple for the given relid, or NULL if
2144 * there isn't any. avRel is pg_autovacuum, already open and suitably locked.
2146 static HeapTuple
2147 get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid)
2149 ScanKeyData entry[1];
2150 SysScanDesc avScan;
2151 HeapTuple avTup;
2153 ScanKeyInit(&entry[0],
2154 Anum_pg_autovacuum_vacrelid,
2155 BTEqualStrategyNumber, F_OIDEQ,
2156 ObjectIdGetDatum(relid));
2158 avScan = systable_beginscan(avRel, AutovacuumRelidIndexId, true,
2159 SnapshotNow, 1, entry);
2161 avTup = systable_getnext(avScan);
2163 if (HeapTupleIsValid(avTup))
2164 avTup = heap_copytuple(avTup);
2166 systable_endscan(avScan);
2168 return avTup;
2172 * get_pgstat_tabentry_relid
2174 * Fetch the pgstat entry of a table, either local to a database or shared.
2176 static PgStat_StatTabEntry *
2177 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2178 PgStat_StatDBEntry *dbentry)
2180 PgStat_StatTabEntry *tabentry = NULL;
2182 if (isshared)
2184 if (PointerIsValid(shared))
2185 tabentry = hash_search(shared->tables, &relid,
2186 HASH_FIND, NULL);
2188 else if (PointerIsValid(dbentry))
2189 tabentry = hash_search(dbentry->tables, &relid,
2190 HASH_FIND, NULL);
2192 return tabentry;
2196 * relation_check_autovac
2198 * For a given relation (either a plain table or TOAST table), check whether it
2199 * needs vacuum or analyze.
2201 * Plain tables that need either are added to the table_list. TOAST tables
2202 * that need vacuum are added to toast_list. Plain tables that don't need
2203 * either but which have a TOAST table are added, as a struct, to
2204 * table_toast_list. The latter is to allow appending the OIDs of the plain
2205 * tables whose TOAST table needs vacuuming into the plain tables list, which
2206 * allows us to substantially reduce the number of "rechecks" that we need to
2207 * do later on.
2209 static void
2210 relation_check_autovac(Oid relid, Form_pg_class classForm,
2211 Form_pg_autovacuum avForm, PgStat_StatTabEntry *tabentry,
2212 List **table_oids, List **table_toast_list,
2213 List **toast_oids)
2215 bool dovacuum;
2216 bool doanalyze;
2218 relation_needs_vacanalyze(relid, avForm, classForm, tabentry,
2219 &dovacuum, &doanalyze);
2221 if (classForm->relkind == RELKIND_TOASTVALUE)
2223 if (dovacuum)
2224 *toast_oids = lappend_oid(*toast_oids, relid);
2226 else
2228 Assert(classForm->relkind == RELKIND_RELATION);
2230 if (dovacuum || doanalyze)
2231 *table_oids = lappend_oid(*table_oids, relid);
2232 else if (OidIsValid(classForm->reltoastrelid))
2234 av_relation *rel = palloc(sizeof(av_relation));
2236 rel->ar_relid = relid;
2237 rel->ar_toastrelid = classForm->reltoastrelid;
2239 *table_toast_list = lappend(*table_toast_list, rel);
2245 * table_recheck_autovac
2247 * Recheck whether a plain table still needs vacuum or analyze; be it because
2248 * it does directly, or because its TOAST table does. Return value is a valid
2249 * autovac_table pointer if it does, NULL otherwise.
2251 static autovac_table *
2252 table_recheck_autovac(Oid relid)
2254 Form_pg_autovacuum avForm = NULL;
2255 Form_pg_class classForm;
2256 HeapTuple classTup;
2257 HeapTuple avTup;
2258 Relation avRel;
2259 bool dovacuum;
2260 bool doanalyze;
2261 autovac_table *tab = NULL;
2262 PgStat_StatTabEntry *tabentry;
2263 bool doit = false;
2264 PgStat_StatDBEntry *shared;
2265 PgStat_StatDBEntry *dbentry;
2267 /* use fresh stats */
2268 autovac_refresh_stats();
2270 shared = pgstat_fetch_stat_dbentry(InvalidOid);
2271 dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2273 /* fetch the relation's relcache entry */
2274 classTup = SearchSysCacheCopy(RELOID,
2275 ObjectIdGetDatum(relid),
2276 0, 0, 0);
2277 if (!HeapTupleIsValid(classTup))
2278 return NULL;
2279 classForm = (Form_pg_class) GETSTRUCT(classTup);
2281 /* fetch the pg_autovacuum entry, if any */
2282 avRel = heap_open(AutovacuumRelationId, AccessShareLock);
2283 avTup = get_pg_autovacuum_tuple_relid(avRel, relid);
2284 if (HeapTupleIsValid(avTup))
2285 avForm = (Form_pg_autovacuum) GETSTRUCT(avTup);
2287 /* fetch the pgstat table entry */
2288 tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2289 shared, dbentry);
2291 relation_needs_vacanalyze(relid, avForm, classForm, tabentry,
2292 &dovacuum, &doanalyze);
2294 /* OK, it needs vacuum by itself */
2295 if (dovacuum)
2296 doit = true;
2297 /* it doesn't need vacuum, but what about it's TOAST table? */
2298 else if (OidIsValid(classForm->reltoastrelid))
2300 Oid toastrelid = classForm->reltoastrelid;
2301 HeapTuple toastClassTup;
2303 toastClassTup = SearchSysCacheCopy(RELOID,
2304 ObjectIdGetDatum(toastrelid),
2305 0, 0, 0);
2306 if (HeapTupleIsValid(toastClassTup))
2308 bool toast_dovacuum;
2309 bool toast_doanalyze;
2310 Form_pg_class toastClassForm;
2311 PgStat_StatTabEntry *toasttabentry;
2313 toastClassForm = (Form_pg_class) GETSTRUCT(toastClassTup);
2314 toasttabentry = get_pgstat_tabentry_relid(toastrelid,
2315 toastClassForm->relisshared,
2316 shared, dbentry);
2318 /* note we use the pg_autovacuum entry for the main table */
2319 relation_needs_vacanalyze(toastrelid, avForm, toastClassForm,
2320 toasttabentry, &toast_dovacuum,
2321 &toast_doanalyze);
2322 /* we only consider VACUUM for toast tables */
2323 if (toast_dovacuum)
2325 dovacuum = true;
2326 doit = true;
2329 heap_freetuple(toastClassTup);
2333 if (doanalyze)
2334 doit = true;
2336 if (doit)
2338 int freeze_min_age;
2339 int vac_cost_limit;
2340 int vac_cost_delay;
2343 * Calculate the vacuum cost parameters and the minimum freeze age. If
2344 * there is a tuple in pg_autovacuum, use it; else, use the GUC
2345 * defaults. Note that the fields may contain "-1" (or indeed any
2346 * negative value), which means use the GUC defaults for each setting.
2347 * In cost_limit, the value 0 also means to use the value from
2348 * elsewhere.
2350 if (avForm != NULL)
2352 vac_cost_limit = (avForm->vac_cost_limit > 0) ?
2353 avForm->vac_cost_limit :
2354 ((autovacuum_vac_cost_limit > 0) ?
2355 autovacuum_vac_cost_limit : VacuumCostLimit);
2357 vac_cost_delay = (avForm->vac_cost_delay >= 0) ?
2358 avForm->vac_cost_delay :
2359 ((autovacuum_vac_cost_delay >= 0) ?
2360 autovacuum_vac_cost_delay : VacuumCostDelay);
2362 freeze_min_age = (avForm->freeze_min_age >= 0) ?
2363 avForm->freeze_min_age : default_freeze_min_age;
2365 else
2367 vac_cost_limit = (autovacuum_vac_cost_limit > 0) ?
2368 autovacuum_vac_cost_limit : VacuumCostLimit;
2370 vac_cost_delay = (autovacuum_vac_cost_delay >= 0) ?
2371 autovacuum_vac_cost_delay : VacuumCostDelay;
2373 freeze_min_age = default_freeze_min_age;
2376 tab = palloc(sizeof(autovac_table));
2377 tab->at_relid = relid;
2378 tab->at_dovacuum = dovacuum;
2379 tab->at_doanalyze = doanalyze;
2380 tab->at_freeze_min_age = freeze_min_age;
2381 tab->at_vacuum_cost_limit = vac_cost_limit;
2382 tab->at_vacuum_cost_delay = vac_cost_delay;
2385 heap_close(avRel, AccessShareLock);
2386 if (HeapTupleIsValid(avTup))
2387 heap_freetuple(avTup);
2388 heap_freetuple(classTup);
2390 return tab;
2394 * relation_needs_vacanalyze
2396 * Check whether a relation needs to be vacuumed or analyzed; return each into
2397 * "dovacuum" and "doanalyze", respectively. avForm and tabentry can be NULL,
2398 * classForm shouldn't.
2400 * A table needs to be vacuumed if the number of dead tuples exceeds a
2401 * threshold. This threshold is calculated as
2403 * threshold = vac_base_thresh + vac_scale_factor * reltuples
2405 * For analyze, the analysis done is that the number of tuples inserted,
2406 * deleted and updated since the last analyze exceeds a threshold calculated
2407 * in the same fashion as above. Note that the collector actually stores
2408 * the number of tuples (both live and dead) that there were as of the last
2409 * analyze. This is asymmetric to the VACUUM case.
2411 * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2412 * transactions back.
2414 * A table whose pg_autovacuum.enabled value is false, is automatically
2415 * skipped (unless we have to vacuum it due to freeze_max_age). Thus
2416 * autovacuum can be disabled for specific tables. Also, when the stats
2417 * collector does not have data about a table, it will be skipped.
2419 * A table whose vac_base_thresh value is <0 takes the base value from the
2420 * autovacuum_vacuum_threshold GUC variable. Similarly, a vac_scale_factor
2421 * value <0 is substituted with the value of
2422 * autovacuum_vacuum_scale_factor GUC variable. Ditto for analyze.
2424 static void
2425 relation_needs_vacanalyze(Oid relid,
2426 Form_pg_autovacuum avForm,
2427 Form_pg_class classForm,
2428 PgStat_StatTabEntry *tabentry,
2429 /* output params below */
2430 bool *dovacuum,
2431 bool *doanalyze)
2433 bool force_vacuum;
2434 float4 reltuples; /* pg_class.reltuples */
2435 /* constants from pg_autovacuum or GUC variables */
2436 int vac_base_thresh,
2437 anl_base_thresh;
2438 float4 vac_scale_factor,
2439 anl_scale_factor;
2440 /* thresholds calculated from above constants */
2441 float4 vacthresh,
2442 anlthresh;
2443 /* number of vacuum (resp. analyze) tuples at this time */
2444 float4 vactuples,
2445 anltuples;
2446 /* freeze parameters */
2447 int freeze_max_age;
2448 TransactionId xidForceLimit;
2450 AssertArg(classForm != NULL);
2451 AssertArg(OidIsValid(relid));
2454 * Determine vacuum/analyze equation parameters. If there is a tuple in
2455 * pg_autovacuum, use it; else, use the GUC defaults. Note that the fields
2456 * may contain "-1" (or indeed any negative value), which means use the GUC
2457 * defaults for each setting.
2459 if (avForm != NULL)
2461 vac_scale_factor = (avForm->vac_scale_factor >= 0) ?
2462 avForm->vac_scale_factor : autovacuum_vac_scale;
2463 vac_base_thresh = (avForm->vac_base_thresh >= 0) ?
2464 avForm->vac_base_thresh : autovacuum_vac_thresh;
2466 anl_scale_factor = (avForm->anl_scale_factor >= 0) ?
2467 avForm->anl_scale_factor : autovacuum_anl_scale;
2468 anl_base_thresh = (avForm->anl_base_thresh >= 0) ?
2469 avForm->anl_base_thresh : autovacuum_anl_thresh;
2471 freeze_max_age = (avForm->freeze_max_age >= 0) ?
2472 Min(avForm->freeze_max_age, autovacuum_freeze_max_age) :
2473 autovacuum_freeze_max_age;
2475 else
2477 vac_scale_factor = autovacuum_vac_scale;
2478 vac_base_thresh = autovacuum_vac_thresh;
2480 anl_scale_factor = autovacuum_anl_scale;
2481 anl_base_thresh = autovacuum_anl_thresh;
2483 freeze_max_age = autovacuum_freeze_max_age;
2486 /* Force vacuum if table is at risk of wraparound */
2487 xidForceLimit = recentXid - freeze_max_age;
2488 if (xidForceLimit < FirstNormalTransactionId)
2489 xidForceLimit -= FirstNormalTransactionId;
2490 force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
2491 TransactionIdPrecedes(classForm->relfrozenxid,
2492 xidForceLimit));
2494 /* User disabled it in pg_autovacuum? (But ignore if at risk) */
2495 if (avForm && !avForm->enabled && !force_vacuum)
2497 *doanalyze = false;
2498 *dovacuum = false;
2499 return;
2502 if (PointerIsValid(tabentry))
2504 reltuples = classForm->reltuples;
2505 vactuples = tabentry->n_dead_tuples;
2506 anltuples = tabentry->n_live_tuples + tabentry->n_dead_tuples -
2507 tabentry->last_anl_tuples;
2509 vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
2510 anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
2513 * Note that we don't need to take special consideration for stat
2514 * reset, because if that happens, the last vacuum and analyze counts
2515 * will be reset too.
2517 elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
2518 NameStr(classForm->relname),
2519 vactuples, vacthresh, anltuples, anlthresh);
2521 /* Determine if this table needs vacuum or analyze. */
2522 *dovacuum = force_vacuum || (vactuples > vacthresh);
2523 *doanalyze = (anltuples > anlthresh);
2525 else
2528 * Skip a table not found in stat hash, unless we have to force
2529 * vacuum for anti-wrap purposes. If it's not acted upon, there's
2530 * no need to vacuum it.
2532 *dovacuum = force_vacuum;
2533 *doanalyze = false;
2536 /* ANALYZE refuses to work with pg_statistics */
2537 if (relid == StatisticRelationId)
2538 *doanalyze = false;
2542 * autovacuum_do_vac_analyze
2543 * Vacuum and/or analyze the specified table
2545 static void
2546 autovacuum_do_vac_analyze(Oid relid, bool dovacuum, bool doanalyze,
2547 int freeze_min_age,
2548 BufferAccessStrategy bstrategy)
2550 VacuumStmt vacstmt;
2551 MemoryContext old_cxt;
2553 MemSet(&vacstmt, 0, sizeof(vacstmt));
2556 * The list must survive transaction boundaries, so make sure we create it
2557 * in a long-lived context
2559 old_cxt = MemoryContextSwitchTo(AutovacMemCxt);
2561 /* Set up command parameters */
2562 vacstmt.type = T_VacuumStmt;
2563 vacstmt.vacuum = dovacuum;
2564 vacstmt.full = false;
2565 vacstmt.analyze = doanalyze;
2566 vacstmt.freeze_min_age = freeze_min_age;
2567 vacstmt.verbose = false;
2568 vacstmt.relation = NULL; /* not used since we pass a relids list */
2569 vacstmt.va_cols = NIL;
2571 /* Let pgstat know what we're doing */
2572 autovac_report_activity(&vacstmt, relid);
2574 vacuum(&vacstmt, list_make1_oid(relid), bstrategy, true);
2575 MemoryContextSwitchTo(old_cxt);
2579 * autovac_report_activity
2580 * Report to pgstat what autovacuum is doing
2582 * We send a SQL string corresponding to what the user would see if the
2583 * equivalent command was to be issued manually.
2585 * Note we assume that we are going to report the next command as soon as we're
2586 * done with the current one, and exit right after the last one, so we don't
2587 * bother to report "<IDLE>" or some such.
2589 static void
2590 autovac_report_activity(VacuumStmt *vacstmt, Oid relid)
2592 char *relname = get_rel_name(relid);
2593 char *nspname = get_namespace_name(get_rel_namespace(relid));
2594 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 32)
2595 char activity[MAX_AUTOVAC_ACTIV_LEN];
2597 /* Report the command and possible options */
2598 if (vacstmt->vacuum)
2599 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2600 "VACUUM%s",
2601 vacstmt->analyze ? " ANALYZE" : "");
2602 else
2603 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
2604 "ANALYZE");
2607 * Report the qualified name of the relation.
2609 * Paranoia is appropriate here in case relation was recently dropped
2610 * --- the lsyscache routines we just invoked will return NULL rather
2611 * than failing.
2613 if (relname && nspname)
2615 int len = strlen(activity);
2617 snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
2618 " %s.%s", nspname, relname);
2621 /* Set statement_timestamp() to current time for pg_stat_activity */
2622 SetCurrentStatementStartTimestamp();
2624 pgstat_report_activity(activity);
2628 * AutoVacuumingActive
2629 * Check GUC vars and report whether the autovacuum process should be
2630 * running.
2632 bool
2633 AutoVacuumingActive(void)
2635 if (!autovacuum_start_daemon || !pgstat_track_counts)
2636 return false;
2637 return true;
2641 * autovac_init
2642 * This is called at postmaster initialization.
2644 * All we do here is annoy the user if he got it wrong.
2646 void
2647 autovac_init(void)
2649 if (autovacuum_start_daemon && !pgstat_track_counts)
2650 ereport(WARNING,
2651 (errmsg("autovacuum not started because of misconfiguration"),
2652 errhint("Enable the \"track_counts\" option.")));
2656 * IsAutoVacuum functions
2657 * Return whether this is either a launcher autovacuum process or a worker
2658 * process.
2660 bool
2661 IsAutoVacuumLauncherProcess(void)
2663 return am_autovacuum_launcher;
2666 bool
2667 IsAutoVacuumWorkerProcess(void)
2669 return am_autovacuum_worker;
2674 * AutoVacuumShmemSize
2675 * Compute space needed for autovacuum-related shared memory
2677 Size
2678 AutoVacuumShmemSize(void)
2680 Size size;
2683 * Need the fixed struct and the array of WorkerInfoData.
2685 size = sizeof(AutoVacuumShmemStruct);
2686 size = MAXALIGN(size);
2687 size = add_size(size, mul_size(autovacuum_max_workers,
2688 sizeof(WorkerInfoData)));
2689 return size;
2693 * AutoVacuumShmemInit
2694 * Allocate and initialize autovacuum-related shared memory
2696 void
2697 AutoVacuumShmemInit(void)
2699 bool found;
2701 AutoVacuumShmem = (AutoVacuumShmemStruct *)
2702 ShmemInitStruct("AutoVacuum Data",
2703 AutoVacuumShmemSize(),
2704 &found);
2705 if (AutoVacuumShmem == NULL)
2706 ereport(FATAL,
2707 (errcode(ERRCODE_OUT_OF_MEMORY),
2708 errmsg("not enough shared memory for autovacuum")));
2710 if (!IsUnderPostmaster)
2712 WorkerInfo worker;
2713 int i;
2715 Assert(!found);
2717 AutoVacuumShmem->av_launcherpid = 0;
2718 AutoVacuumShmem->av_freeWorkers = INVALID_OFFSET;
2719 SHMQueueInit(&AutoVacuumShmem->av_runningWorkers);
2720 AutoVacuumShmem->av_startingWorker = INVALID_OFFSET;
2722 worker = (WorkerInfo) ((char *) AutoVacuumShmem +
2723 MAXALIGN(sizeof(AutoVacuumShmemStruct)));
2725 /* initialize the WorkerInfo free list */
2726 for (i = 0; i < autovacuum_max_workers; i++)
2728 worker[i].wi_links.next = AutoVacuumShmem->av_freeWorkers;
2729 AutoVacuumShmem->av_freeWorkers = MAKE_OFFSET(&worker[i]);
2732 else
2733 Assert(found);
2737 * autovac_refresh_stats
2738 * Refresh pgstats data for an autovacuum process
2740 * Cause the next pgstats read operation to obtain fresh data, but throttle
2741 * such refreshing in the autovacuum launcher. This is mostly to avoid
2742 * rereading the pgstats files too many times in quick succession when there
2743 * are many databases.
2745 * Note: we avoid throttling in the autovac worker, as it would be
2746 * counterproductive in the recheck logic.
2748 static void
2749 autovac_refresh_stats(void)
2751 if (IsAutoVacuumLauncherProcess())
2753 static TimestampTz last_read = 0;
2754 TimestampTz current_time;
2756 current_time = GetCurrentTimestamp();
2758 if (!TimestampDifferenceExceeds(last_read, current_time,
2759 STATS_READ_DELAY))
2760 return;
2762 last_read = current_time;
2765 pgstat_clear_snapshot();