src/backend/postmaster/autovacuum.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * autovacuum.c
   4  *
   5  * PostgreSQL Integrated Autovacuum Daemon
   6  *
   7  * The autovacuum system is structured in two different kinds of processes: the
   8  * autovacuum launcher and the autovacuum worker.  The launcher is an
   9  * always-running process, started by the postmaster when the autovacuum GUC
  10  * parameter is set.  The launcher schedules autovacuum workers to be started
  11  * when appropriate.  The workers are the processes which execute the actual
  12  * vacuuming; they connect to a database as determined in the launcher, and
  13  * once connected they examine the catalogs to select the tables to vacuum.
  14  *
  15  * The autovacuum launcher cannot start the worker processes by itself,
  16  * because doing so would cause robustness issues (namely, failure to shut
  17  * them down on exceptional conditions, and also, since the launcher is
  18  * connected to shared memory and is thus subject to corruption there, it is
  19  * not as robust as the postmaster).  So it leaves that task to the postmaster.
  20  *
  21  * There is an autovacuum shared memory area, where the launcher stores
  22  * information about the database it wants vacuumed.  When it wants a new
  23  * worker to start, it sets a flag in shared memory and sends a signal to the
  24  * postmaster.  Then postmaster knows nothing more than it must start a worker;
  25  * so it forks a new child, which turns into a worker.  This new process
  26  * connects to shared memory, and there it can inspect the information that the
  27  * launcher has set up.
  28  *
  29  * If the fork() call fails in the postmaster, it sets a flag in the shared
  30  * memory area, and sends a signal to the launcher.  The launcher, upon
  31  * noticing the flag, can try starting the worker again by resending the
  32  * signal.  Note that the failure can only be transient (fork failure due to
  33  * high load, memory pressure, too many processes, etc); more permanent
  34  * problems, like failure to connect to a database, are detected later in the
  35  * worker and dealt with just by having the worker exit normally.  The launcher
  36  * will launch a new worker again later, per schedule.
  37  *
  38  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
  39  * launcher then wakes up and is able to launch another worker, if the schedule
  40  * is so tight that a new worker is needed immediately.  At this time the
  41  * launcher can also balance the settings for the various remaining workers'
  42  * cost-based vacuum delay feature.
  43  *
  44  * Note that there can be more than one worker in a database concurrently.
  45  * They will store the table they are currently vacuuming in shared memory, so
  46  * that other workers avoid being blocked waiting for the vacuum lock for that
  47  * table.  They will also fetch the last time the table was vacuumed from
  48  * pgstats just before vacuuming each table, to avoid vacuuming a table that
  49  * was just finished being vacuumed by another worker and thus is no longer
  50  * noted in shared memory.  However, there is a small window (due to not yet
  51  * holding the relation lock) during which a worker may choose a table that was
  52  * already vacuumed; this is a bug in the current design.
  53  *
  54  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  55  * Portions Copyright (c) 1994, Regents of the University of California
  56  *
  57  *
  58  * IDENTIFICATION
  59  *        src/backend/postmaster/autovacuum.c
  60  *
  61  *-------------------------------------------------------------------------
  62  */
  63 #include "postgres.h"
  64
  65 #include <signal.h>
  66 #include <sys/time.h>
  67 #include <unistd.h>
  68
  69 #include "access/heapam.h"
  70 #include "access/htup_details.h"
  71 #include "access/multixact.h"
  72 #include "access/reloptions.h"
  73 #include "access/tableam.h"
  74 #include "access/transam.h"
  75 #include "access/xact.h"
  76 #include "catalog/dependency.h"
  77 #include "catalog/namespace.h"
  78 #include "catalog/pg_database.h"
  79 #include "commands/dbcommands.h"
  80 #include "commands/vacuum.h"
  81 #include "lib/ilist.h"
  82 #include "libpq/pqsignal.h"
  83 #include "miscadmin.h"
  84 #include "nodes/makefuncs.h"
  85 #include "pgstat.h"
  86 #include "postmaster/autovacuum.h"
  87 #include "postmaster/fork_process.h"
  88 #include "postmaster/interrupt.h"
  89 #include "postmaster/postmaster.h"
  90 #include "storage/bufmgr.h"
  91 #include "storage/ipc.h"
  92 #include "storage/latch.h"
  93 #include "storage/lmgr.h"
  94 #include "storage/pmsignal.h"
  95 #include "storage/proc.h"
  96 #include "storage/procsignal.h"
  97 #include "storage/sinvaladt.h"
  98 #include "storage/smgr.h"
  99 #include "tcop/tcopprot.h"
 100 #include "utils/fmgroids.h"
 101 #include "utils/fmgrprotos.h"
 102 #include "utils/guc_hooks.h"
 103 #include "utils/lsyscache.h"
 104 #include "utils/memutils.h"
 105 #include "utils/ps_status.h"
 106 #include "utils/rel.h"
 107 #include "utils/snapmgr.h"
 108 #include "utils/syscache.h"
 109 #include "utils/timeout.h"
 110 #include "utils/timestamp.h"
 111
 112
/*
 * GUC parameters
 *
 * Configuration variables for the autovacuum subsystem.  Only three of them
 * carry an explicit initializer here; the others are file-scope objects
 * without one and therefore start out as zero.
 * NOTE(review): the effective defaults are presumably installed by the GUC
 * machinery elsewhere -- confirm there before relying on these values.
 */
bool		autovacuum_start_daemon = false;
int			autovacuum_max_workers; /* sizes the shared WorkerInfo array */
int			autovacuum_work_mem = -1;
int			autovacuum_naptime; /* in seconds */

/*
 * Threshold / scale-factor pairs for vacuum, insert-driven vacuum and
 * analyze.  NOTE(review): presumably consumed by relation_needs_vacanalyze;
 * confirm against its definition.
 */
int			autovacuum_vac_thresh;
double		autovacuum_vac_scale;
int			autovacuum_vac_ins_thresh;
double		autovacuum_vac_ins_scale;
int			autovacuum_anl_thresh;
double		autovacuum_anl_scale;

/* Age limits; recentXid/recentMulti are the comparison points for these */
int			autovacuum_freeze_max_age;
int			autovacuum_multixact_freeze_max_age;

/* Settings for the cost-based vacuum delay feature */
double		autovacuum_vac_cost_delay;
int			autovacuum_vac_cost_limit;

/* NOTE(review): unit is not stated in this file (likely milliseconds) */
int			Log_autovacuum_min_duration = 600000;
 133
/*
 * Bounds on the time between two awakenings of the launcher.
 *
 * MIN_AUTOVAC_SLEEPTIME is the minimum allowed time.  MAX_AUTOVAC_SLEEPTIME
 * is presumably the matching upper bound (confirm in
 * launcher_determine_sleep).  Beware that the two macros are expressed in
 * different units, and that the minimum is a floating-point constant while
 * the maximum is an integer.
 */
#define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */
#define MAX_AUTOVAC_SLEEPTIME 300	/* seconds */
 137
/*
 * Flags to tell if we are in an autovacuum process.
 *
 * am_autovacuum_launcher is set by AutovacuumLauncherIAm() (EXEC_BACKEND
 * builds only) and at the top of AutoVacLauncherMain().
 */
static bool am_autovacuum_launcher = false;
static bool am_autovacuum_worker = false;

/*
 * Flags set by signal handlers.
 *
 * got_SIGUSR2 is tested and cleared in the launcher's main loop.  The
 * launcher installs avl_sigusr2_handler for SIGUSR2; presumably that handler
 * is what sets this flag.
 */
static volatile sig_atomic_t got_SIGUSR2 = false;

/* Comparison points for determining whether freeze_max_age is exceeded */
static TransactionId recentXid;
static MultiXactId recentMulti;

/* Default freeze ages to use for autovacuum (varies by database) */
static int	default_freeze_min_age;
static int	default_freeze_table_age;
static int	default_multixact_freeze_min_age;
static int	default_multixact_freeze_table_age;

/*
 * Memory context for long-lived data.
 *
 * In the launcher this is created as a child of TopMemoryContext, made the
 * current context, and reset (children deleted) during error recovery.
 */
static MemoryContext AutovacMemCxt;
 157
/*
 * struct to keep track of databases in launcher
 *
 * Elements of this type are what the launcher's DatabaseList holds; adl_node
 * is the embedded list link.  The struct also doubles as a hash entry, which
 * is why the key has to be the first member.
 */
typedef struct avl_dbase
{
	Oid			adl_datid;		/* hash key -- must be first */
	TimestampTz adl_next_worker;	/* presumably when the next worker is due
									 * for this database -- TODO confirm */
	int			adl_score;		/* NOTE(review): meaning not visible here;
								 * likely a sort key, see db_comparator */
	dlist_node	adl_node;		/* link in DatabaseList */
} avl_dbase;
 166
/*
 * struct to keep track of databases in worker
 *
 * One entry describes one database: its OID and name, two horizon values
 * (a TransactionId and a MultiXactId), and a pointer to a pgstat database
 * entry.
 * NOTE(review): the horizons presumably mirror pg_database's datfrozenxid
 * and datminmxid; confirm where the entries are built.
 */
typedef struct avw_dbase
{
	Oid			adw_datid;		/* database OID */
	char	   *adw_name;		/* database name */
	TransactionId adw_frozenxid;
	MultiXactId adw_minmulti;
	PgStat_StatDBEntry *adw_entry;	/* stats entry for this database */
} avw_dbase;
 176
/*
 * struct to keep track of tables to vacuum and/or analyze, in 1st pass
 *
 * Used as a hash entry keyed by TOAST relation OID, hence the key must be
 * the first member.
 */
typedef struct av_relation
{
	Oid			ar_toastrelid;	/* hash key - must be first */
	Oid			ar_relid;		/* presumably the main table owning that
								 * TOAST relation -- TODO confirm */
	bool		ar_hasrelopts;
	AutoVacOpts ar_reloptions;	/* copy of AutoVacOpts from the main table's
								 * reloptions.  This is stored by value, so
								 * it cannot itself be NULL; presumably
								 * ar_hasrelopts tells whether the copy is
								 * meaningful -- confirm at the use sites */
} av_relation;
 186
/*
 * struct to keep track of tables to vacuum and/or analyze, after rechecking
 *
 * table_recheck_autovac() returns a pointer to one of these, and
 * autovacuum_do_vac_analyze() and autovac_report_activity() take one as
 * their argument.
 */
typedef struct autovac_table
{
	Oid			at_relid;		/* OID of the table to process */
	VacuumParams at_params;		/* parameters for the vacuum/analyze run */
	double		at_vacuum_cost_delay;	/* cost-based delay settings */
	int			at_vacuum_cost_limit;
	bool		at_dobalance;	/* presumably: include in cost balancing --
								 * TODO confirm */
	bool		at_sharedrel;	/* compare wi_sharedrel in WorkerInfoData */
	char	   *at_relname;		/* relation, schema and database names */
	char	   *at_nspname;
	char	   *at_datname;
} autovac_table;
 200
/*-------------
 * This struct holds information about a single worker's whereabouts.  We keep
 * an array of these in shared memory, sized according to
 * autovacuum_max_workers.
 *
 * wi_links		entry into free list or running list
 * wi_dboid		OID of the database this worker is supposed to work on
 * wi_tableoid	OID of the table currently being vacuumed, if any
 * wi_sharedrel flag indicating whether table is marked relisshared
 * wi_proc		pointer to PGPROC of the running worker, NULL if not started
 * wi_launchtime Time at which this worker was launched; the launcher compares
 *				it with the current time to notice a worker that has been
 *				"starting" for too long
 * wi_dobalance presumably whether this worker takes part in cost balancing
 *				(see autovac_balance_cost) -- TODO confirm
 * wi_cost_*	Vacuum cost-based delay parameters current in this worker
 *				(wi_cost_delay, wi_cost_limit, wi_cost_limit_base)
 *
 * All fields are protected by AutovacuumLock, except for wi_tableoid and
 * wi_sharedrel which are protected by AutovacuumScheduleLock (note these
 * two fields are read-only for everyone except that worker itself).
 *-------------
 */
typedef struct WorkerInfoData
{
	dlist_node	wi_links;
	Oid			wi_dboid;
	Oid			wi_tableoid;
	PGPROC	   *wi_proc;
	TimestampTz wi_launchtime;
	bool		wi_dobalance;
	bool		wi_sharedrel;
	double		wi_cost_delay;
	int			wi_cost_limit;
	int			wi_cost_limit_base;
} WorkerInfoData;

/* Handle type: note that WorkerInfo is a pointer to WorkerInfoData */
typedef struct WorkerInfoData *WorkerInfo;
 234
/*
 * Possible signals received by the launcher from remote processes.  These are
 * stored atomically in shared memory so that other processes can set them
 * without locking.
 *
 * The enumerators index the av_signal[] array in the shared memory struct.
 * The launcher inspects (and clears) those array slots in its main loop
 * after it has noticed a SIGUSR2.
 */
typedef enum
{
	AutoVacForkFailed,			/* failed trying to start a worker */
	AutoVacRebalance,			/* rebalance the cost limits */
	AutoVacNumSignals			/* must be last: it is the array length */
}			AutoVacuumSignal;
 246
/*
 * Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems.  This
 * list is mostly protected by AutovacuumLock, except that if an item is
 * marked 'active' other processes must not modify the work-identifying
 * members.
 *
 * NOTE(review): "work-identifying members" presumably means avw_type,
 * avw_database, avw_relation and avw_blockNumber -- confirm.
 */
typedef struct AutoVacuumWorkItem
{
	AutoVacuumWorkItemType avw_type;	/* what kind of work is requested */
	bool		avw_used;		/* below data is valid */
	bool		avw_active;		/* being processed */
	Oid			avw_database;
	Oid			avw_relation;
	BlockNumber avw_blockNumber;
} AutoVacuumWorkItem;

/* Fixed number of slots in the shared av_workItems array */
#define NUM_WORKITEMS	256
 264
/*-------------
 * The main autovacuum shmem struct.  On shared memory we store this main
 * struct and the array of WorkerInfo structs.  This struct keeps:
 *
 * av_signal		set by other processes to indicate various conditions
 *					(one slot per AutoVacuumSignal value)
 * av_launcherpid	the PID of the autovacuum launcher (the launcher stores
 *					its own MyProcPid here before entering its main loop)
 * av_freeWorkers	the WorkerInfo freelist; the launcher treats a non-empty
 *					freelist as "a worker slot is available"
 * av_runningWorkers the WorkerInfo non-free queue
 * av_startingWorker pointer to WorkerInfo currently being started (cleared by
 *					the worker itself as soon as it's up and running)
 * av_workItems		work item array (NUM_WORKITEMS entries)
 *
 * This struct is protected by AutovacuumLock, except for av_signal and parts
 * of the worker list (see above).
 *-------------
 */
typedef struct
{
	sig_atomic_t av_signal[AutoVacNumSignals];
	pid_t		av_launcherpid;
	dlist_head	av_freeWorkers;
	dlist_head	av_runningWorkers;
	WorkerInfo	av_startingWorker;
	AutoVacuumWorkItem av_workItems[NUM_WORKITEMS];
} AutoVacuumShmemStruct;

/* This process's pointer to the shared struct described above */
static AutoVacuumShmemStruct *AutoVacuumShmem;
 292
/*
 * the database list (of avl_dbase elements) in the launcher, and the context
 * that contains it
 *
 * During error recovery the launcher resets AutovacMemCxt and then
 * reinitializes both of these (empty list, NULL context) so that no pointer
 * into freed memory is left behind.
 */
static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
static MemoryContext DatabaseListCxt = NULL;

/* Pointer to my own WorkerInfo, valid on each worker */
static WorkerInfo MyWorkerInfo = NULL;

/*
 * PID of launcher, valid only in worker while shutting down.  Unlike its
 * neighbours this variable is not static, so it has external linkage.
 */
int			AutovacuumLauncherPid = 0;
 305
/*
 * Forward declarations.
 *
 * The forkexec wrappers exist only in EXEC_BACKEND builds.  The two *Main
 * functions never return; their linkage is controlled by NON_EXEC_STATIC.
 */
#ifdef EXEC_BACKEND
static pid_t avlauncher_forkexec(void);
static pid_t avworker_forkexec(void);
#endif
NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn();
NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn();

/*
 * Helpers for scheduling and launching workers; several of them are called
 * directly from AutoVacLauncherMain.
 */
static Oid	do_start_worker(void);
static void HandleAutoVacLauncherInterrupts(void);
static void AutoVacLauncherShutdown(void) pg_attribute_noreturn();
static void launcher_determine_sleep(bool canlaunch, bool recursing,
									 struct timeval *nap);
static void launch_worker(TimestampTz now);
static List *get_database_list(void);
static void rebuild_database_list(Oid newdb);
static int	db_comparator(const void *a, const void *b);
static void autovac_balance_cost(void);

/* NOTE(review): presumably the worker's main routine and its exit callback */
static void do_autovacuum(void);
static void FreeWorkerInfo(int code, Datum arg);

/* Deciding whether a given relation needs vacuum and/or analyze */
static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
											TupleDesc pg_class_desc,
											int effective_multixact_freeze_max_age);
static void recheck_relation_needs_vacanalyze(Oid relid, AutoVacOpts *avopts,
											  Form_pg_class classForm,
											  int effective_multixact_freeze_max_age,
											  bool *dovacuum, bool *doanalyze, bool *wraparound);
static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
									  Form_pg_class classForm,
									  PgStat_StatTabEntry *tabentry,
									  int effective_multixact_freeze_max_age,
									  bool *dovacuum, bool *doanalyze, bool *wraparound);

/* Performing the work and reporting what is being done */
static void autovacuum_do_vac_analyze(autovac_table *tab,
									  BufferAccessStrategy bstrategy);
static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
										 TupleDesc pg_class_desc);
static void perform_work_item(AutoVacuumWorkItem *workitem);
static void autovac_report_activity(autovac_table *tab);
static void autovac_report_workitem(AutoVacuumWorkItem *workitem,
									const char *nspname, const char *relname);

/* SIGUSR2 handler installed by the launcher */
static void avl_sigusr2_handler(SIGNAL_ARGS);
 349
 350
 351
 352 /********************************************************************
 353  *                                        AUTOVACUUM LAUNCHER CODE
 354  ********************************************************************/
 355
 356 #ifdef EXEC_BACKEND
 357 /*
 358  * forkexec routine for the autovacuum launcher process.
 359  *
 360  * Format up the arglist, then fork and exec.
 361  */
 362 static pid_t
 363 avlauncher_forkexec(void)
 364 {
 365         char       *av[10];
 366         int                     ac = 0;
 367
 368         av[ac++] = "postgres";
 369         av[ac++] = "--forkavlauncher";
 370         av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
 371         av[ac] = NULL;
 372
 373         Assert(ac < lengthof(av));
 374
 375         return postmaster_forkexec(ac, av);
 376 }
 377
/*
 * AutovacuumLauncherIAm
 *		Mark the current process as the autovacuum launcher.
 *
 * We need this set from the outside, before InitProcess is called.  The
 * function only exists in EXEC_BACKEND builds; AutoVacLauncherMain sets the
 * same flag itself as its first action.
 */
void
AutovacuumLauncherIAm(void)
{
	am_autovacuum_launcher = true;
}
 386 #endif
 387
 388 /*
 389  * Main entry point for autovacuum launcher process, to be called from the
 390  * postmaster.
 391  */
 392 int
 393 StartAutoVacLauncher(void)
 394 {
 395         pid_t           AutoVacPID;
 396
 397 #ifdef EXEC_BACKEND
 398         switch ((AutoVacPID = avlauncher_forkexec()))
 399 #else
 400         switch ((AutoVacPID = fork_process()))
 401 #endif
 402         {
 403                 case -1:
 404                         ereport(LOG,
 405                                         (errmsg("could not fork autovacuum launcher process: %m")));
 406                         return 0;
 407
 408 #ifndef EXEC_BACKEND
 409                 case 0:
 410                         /* in postmaster child ... */
 411                         InitPostmasterChild();
 412
 413                         /* Close the postmaster's sockets */
 414                         ClosePostmasterPorts(false);
 415
 416                         AutoVacLauncherMain(0, NULL);
 417                         break;
 418 #endif
 419                 default:
 420                         return (int) AutoVacPID;
 421         }
 422
 423         /* shouldn't get here */
 424         return 0;
 425 }
 426
 427 /*
 428  * Main loop for the autovacuum launcher process.
 429  */
 430 NON_EXEC_STATIC void
 431 AutoVacLauncherMain(int argc, char *argv[])
 432 {
 433         sigjmp_buf      local_sigjmp_buf;
 434
 435         am_autovacuum_launcher = true;
 436
 437         MyBackendType = B_AUTOVAC_LAUNCHER;
 438         init_ps_display(NULL);
 439
 440         ereport(DEBUG1,
 441                         (errmsg_internal("autovacuum launcher started")));
 442
 443         if (PostAuthDelay)
 444                 pg_usleep(PostAuthDelay * 1000000L);
 445
 446         SetProcessingMode(InitProcessing);
 447
 448         /*
 449          * Set up signal handlers.  We operate on databases much like a regular
 450          * backend, so we use the same signal handling.  See equivalent code in
 451          * tcop/postgres.c.
 452          */
 453         pqsignal(SIGHUP, SignalHandlerForConfigReload);
 454         pqsignal(SIGINT, StatementCancelHandler);
 455         pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
 456         /* SIGQUIT handler was already set up by InitPostmasterChild */
 457
 458         InitializeTimeouts();           /* establishes SIGALRM handler */
 459
 460         pqsignal(SIGPIPE, SIG_IGN);
 461         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
 462         pqsignal(SIGUSR2, avl_sigusr2_handler);
 463         pqsignal(SIGFPE, FloatExceptionHandler);
 464         pqsignal(SIGCHLD, SIG_DFL);
 465
 466         /*
 467          * Create a per-backend PGPROC struct in shared memory, except in the
 468          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
 469          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
 470          * had to do some stuff with LWLocks).
 471          */
 472 #ifndef EXEC_BACKEND
 473         InitProcess();
 474 #endif
 475
 476         /* Early initialization */
 477         BaseInit();
 478
 479         InitPostgres(NULL, InvalidOid, NULL, InvalidOid, false, false, NULL);
 480
 481         SetProcessingMode(NormalProcessing);
 482
 483         /*
 484          * Create a memory context that we will do all our work in.  We do this so
 485          * that we can reset the context during error recovery and thereby avoid
 486          * possible memory leaks.
 487          */
 488         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
 489                                                                                   "Autovacuum Launcher",
 490                                                                                   ALLOCSET_DEFAULT_SIZES);
 491         MemoryContextSwitchTo(AutovacMemCxt);
 492
 493         /*
 494          * If an exception is encountered, processing resumes here.
 495          *
 496          * This code is a stripped down version of PostgresMain error recovery.
 497          *
 498          * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
 499          * (to wit, BlockSig) will be restored when longjmp'ing to here.  Thus,
 500          * signals other than SIGQUIT will be blocked until we complete error
 501          * recovery.  It might seem that this policy makes the HOLD_INTERRUPTS()
 502          * call redundant, but it is not since InterruptPending might be set
 503          * already.
 504          */
 505         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
 506         {
 507                 /* since not using PG_TRY, must reset error stack by hand */
 508                 error_context_stack = NULL;
 509
 510                 /* Prevents interrupts while cleaning up */
 511                 HOLD_INTERRUPTS();
 512
 513                 /* Forget any pending QueryCancel or timeout request */
 514                 disable_all_timeouts(false);
 515                 QueryCancelPending = false; /* second to avoid race condition */
 516
 517                 /* Report the error to the server log */
 518                 EmitErrorReport();
 519
 520                 /* Abort the current transaction in order to recover */
 521                 AbortCurrentTransaction();
 522
 523                 /*
 524                  * Release any other resources, for the case where we were not in a
 525                  * transaction.
 526                  */
 527                 LWLockReleaseAll();
 528                 pgstat_report_wait_end();
 529                 AbortBufferIO();
 530                 UnlockBuffers();
 531                 /* this is probably dead code, but let's be safe: */
 532                 if (AuxProcessResourceOwner)
 533                         ReleaseAuxProcessResources(false);
 534                 AtEOXact_Buffers(false);
 535                 AtEOXact_SMgr();
 536                 AtEOXact_Files(false);
 537                 AtEOXact_HashTables(false);
 538
 539                 /*
 540                  * Now return to normal top-level context and clear ErrorContext for
 541                  * next time.
 542                  */
 543                 MemoryContextSwitchTo(AutovacMemCxt);
 544                 FlushErrorState();
 545
 546                 /* Flush any leaked data in the top-level context */
 547                 MemoryContextResetAndDeleteChildren(AutovacMemCxt);
 548
 549                 /* don't leave dangling pointers to freed memory */
 550                 DatabaseListCxt = NULL;
 551                 dlist_init(&DatabaseList);
 552
 553                 /* Now we can allow interrupts again */
 554                 RESUME_INTERRUPTS();
 555
 556                 /* if in shutdown mode, no need for anything further; just go away */
 557                 if (ShutdownRequestPending)
 558                         AutoVacLauncherShutdown();
 559
 560                 /*
 561                  * Sleep at least 1 second after any error.  We don't want to be
 562                  * filling the error logs as fast as we can.
 563                  */
 564                 pg_usleep(1000000L);
 565         }
 566
 567         /* We can now handle ereport(ERROR) */
 568         PG_exception_stack = &local_sigjmp_buf;
 569
 570         /* must unblock signals before calling rebuild_database_list */
 571         sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
 572
 573         /*
 574          * Set always-secure search path.  Launcher doesn't connect to a database,
 575          * so this has no effect.
 576          */
 577         SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
 578
 579         /*
 580          * Force zero_damaged_pages OFF in the autovac process, even if it is set
 581          * in postgresql.conf.  We don't really want such a dangerous option being
 582          * applied non-interactively.
 583          */
 584         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
 585
 586         /*
 587          * Force settable timeouts off to avoid letting these settings prevent
 588          * regular maintenance from being executed.
 589          */
 590         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
 591         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
 592         SetConfigOption("idle_in_transaction_session_timeout", "0",
 593                                         PGC_SUSET, PGC_S_OVERRIDE);
 594
 595         /*
 596          * Force default_transaction_isolation to READ COMMITTED.  We don't want
 597          * to pay the overhead of serializable mode, nor add any risk of causing
 598          * deadlocks or delaying other transactions.
 599          */
 600         SetConfigOption("default_transaction_isolation", "read committed",
 601                                         PGC_SUSET, PGC_S_OVERRIDE);
 602
 603         /*
 604          * Even when system is configured to use a different fetch consistency,
 605          * for autovac we always want fresh stats.
 606          */
 607         SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE);
 608
 609         /*
 610          * In emergency mode, just start a worker (unless shutdown was requested)
 611          * and go away.
 612          */
 613         if (!AutoVacuumingActive())
 614         {
 615                 if (!ShutdownRequestPending)
 616                         do_start_worker();
 617                 proc_exit(0);                   /* done */
 618         }
 619
 620         AutoVacuumShmem->av_launcherpid = MyProcPid;
 621
 622         /*
 623          * Create the initial database list.  The invariant we want this list to
 624          * keep is that it's ordered by decreasing next_time.  As soon as an entry
 625          * is updated to a higher time, it will be moved to the front (which is
 626          * correct because the only operation is to add autovacuum_naptime to the
 627          * entry, and time always increases).
 628          */
 629         rebuild_database_list(InvalidOid);
 630
 631         /* loop until shutdown request */
 632         while (!ShutdownRequestPending)
 633         {
 634                 struct timeval nap;
 635                 TimestampTz current_time = 0;
 636                 bool            can_launch;
 637
 638                 /*
 639                  * This loop is a bit different from the normal use of WaitLatch,
 640                  * because we'd like to sleep before the first launch of a child
 641                  * process.  So it's WaitLatch, then ResetLatch, then check for
 642                  * wakening conditions.
 643                  */
 644
 645                 launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
 646                                                                  false, &nap);
 647
 648                 /*
 649                  * Wait until naptime expires or we get some type of signal (all the
 650                  * signal handlers will wake us by calling SetLatch).
 651                  */
 652                 (void) WaitLatch(MyLatch,
 653                                                  WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
 654                                                  (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
 655                                                  WAIT_EVENT_AUTOVACUUM_MAIN);
 656
 657                 ResetLatch(MyLatch);
 658
 659                 HandleAutoVacLauncherInterrupts();
 660
 661                 /*
 662                  * a worker finished, or postmaster signaled failure to start a worker
 663                  */
 664                 if (got_SIGUSR2)
 665                 {
 666                         got_SIGUSR2 = false;
 667
 668                         /* rebalance cost limits, if needed */
 669                         if (AutoVacuumShmem->av_signal[AutoVacRebalance])
 670                         {
 671                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
 672                                 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
 673                                 autovac_balance_cost();
 674                                 LWLockRelease(AutovacuumLock);
 675                         }
 676
 677                         if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
 678                         {
 679                                 /*
 680                                  * If the postmaster failed to start a new worker, we sleep
 681                                  * for a little while and resend the signal.  The new worker's
 682                                  * state is still in memory, so this is sufficient.  After
 683                                  * that, we restart the main loop.
 684                                  *
 685                                  * XXX should we put a limit to the number of times we retry?
 686                                  * I don't think it makes much sense, because a future start
 687                                  * of a worker will continue to fail in the same way.
 688                                  */
 689                                 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
 690                                 pg_usleep(1000000L);    /* 1s */
 691                                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
 692                                 continue;
 693                         }
 694                 }
 695
 696                 /*
 697                  * There are some conditions that we need to check before trying to
 698                  * start a worker.  First, we need to make sure that there is a worker
 699                  * slot available.  Second, we need to make sure that no other worker
 700                  * failed while starting up.
 701                  */
 702
 703                 current_time = GetCurrentTimestamp();
 704                 LWLockAcquire(AutovacuumLock, LW_SHARED);
 705
 706                 can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
 707
 708                 if (AutoVacuumShmem->av_startingWorker != NULL)
 709                 {
 710                         int                     waittime;
 711                         WorkerInfo      worker = AutoVacuumShmem->av_startingWorker;
 712
 713                         /*
 714                          * We can't launch another worker when another one is still
 715                          * starting up (or failed while doing so), so just sleep for a bit
 716                          * more; that worker will wake us up again as soon as it's ready.
 717                          * We will only wait autovacuum_naptime seconds (up to a maximum
 718                          * of 60 seconds) for this to happen however.  Note that failure
 719                          * to connect to a particular database is not a problem here,
 720                          * because the worker removes itself from the startingWorker
 721                          * pointer before trying to connect.  Problems detected by the
 722                          * postmaster (like fork() failure) are also reported and handled
 723                          * differently.  The only problems that may cause this code to
 724                          * fire are errors in the earlier sections of AutoVacWorkerMain,
 725                          * before the worker removes the WorkerInfo from the
 726                          * startingWorker pointer.
 727                          */
 728                         waittime = Min(autovacuum_naptime, 60) * 1000;
 729                         if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
 730                                                                                    waittime))
 731                         {
 732                                 LWLockRelease(AutovacuumLock);
 733                                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
 734
 735                                 /*
 736                                  * No other process can put a worker in starting mode, so if
 737                                  * startingWorker is still INVALID after exchanging our lock,
 738                                  * we assume it's the same one we saw above (so we don't
 739                                  * recheck the launch time).
 740                                  */
 741                                 if (AutoVacuumShmem->av_startingWorker != NULL)
 742                                 {
 743                                         worker = AutoVacuumShmem->av_startingWorker;
 744                                         worker->wi_dboid = InvalidOid;
 745                                         worker->wi_tableoid = InvalidOid;
 746                                         worker->wi_sharedrel = false;
 747                                         worker->wi_proc = NULL;
 748                                         worker->wi_launchtime = 0;
 749                                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
 750                                                                         &worker->wi_links);
 751                                         AutoVacuumShmem->av_startingWorker = NULL;
 752                                         ereport(WARNING,
 753                                                         errmsg("autovacuum worker took too long to start; canceled"));
 754                                 }
 755                         }
 756                         else
 757                                 can_launch = false;
 758                 }
 759                 LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
 760
 761                 /* if we can't do anything, just go back to sleep */
 762                 if (!can_launch)
 763                         continue;
 764
 765                 /* We're OK to start a new worker */
 766
 767                 if (dlist_is_empty(&DatabaseList))
 768                 {
 769                         /*
 770                          * Special case when the list is empty: start a worker right away.
 771                          * This covers the initial case, when no database is in pgstats
 772                          * (thus the list is empty).  Note that the constraints in
 773                          * launcher_determine_sleep keep us from starting workers too
 774                          * quickly (at most once every autovacuum_naptime when the list is
 775                          * empty).
 776                          */
 777                         launch_worker(current_time);
 778                 }
 779                 else
 780                 {
 781                         /*
 782                          * because rebuild_database_list constructs a list with most
 783                          * distant adl_next_worker first, we obtain our database from the
 784                          * tail of the list.
 785                          */
 786                         avl_dbase  *avdb;
 787
 788                         avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
 789
 790                         /*
 791                          * launch a worker if next_worker is right now or it is in the
 792                          * past
 793                          */
 794                         if (TimestampDifferenceExceeds(avdb->adl_next_worker,
 795                                                                                    current_time, 0))
 796                                 launch_worker(current_time);
 797                 }
 798         }
 799
 800         AutoVacLauncherShutdown();
 801 }
 802
 803 /*
 804  * Process any new interrupts.
 805  */
 806 static void
 807 HandleAutoVacLauncherInterrupts(void)
 808 {
 809         /* the normal shutdown case */
 810         if (ShutdownRequestPending)
 811                 AutoVacLauncherShutdown();
 812
 813         if (ConfigReloadPending)
 814         {
 815                 ConfigReloadPending = false;
 816                 ProcessConfigFile(PGC_SIGHUP);
 817
 818                 /* shutdown requested in config file? */
 819                 if (!AutoVacuumingActive())
 820                         AutoVacLauncherShutdown();
 821
 822                 /* rebalance in case the default cost parameters changed */
 823                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
 824                 autovac_balance_cost();
 825                 LWLockRelease(AutovacuumLock);
 826
 827                 /* rebuild the list in case the naptime changed */
 828                 rebuild_database_list(InvalidOid);
 829         }
 830
 831         /* Process barrier events */
 832         if (ProcSignalBarrierPending)
 833                 ProcessProcSignalBarrier();
 834
 835         /* Perform logging of memory contexts of this process */
 836         if (LogMemoryContextPending)
 837                 ProcessLogMemoryContextInterrupt();
 838
 839         /* Process sinval catchup interrupts that happened while sleeping */
 840         ProcessCatchupInterrupt();
 841 }
 842
 843 /*
 844  * Perform a normal exit from the autovac launcher.
 845  */
 846 static void
 847 AutoVacLauncherShutdown(void)
 848 {
 849         ereport(DEBUG1,
 850                         (errmsg_internal("autovacuum launcher shutting down")));
 851         AutoVacuumShmem->av_launcherpid = 0;
 852
 853         proc_exit(0);                           /* done */
 854 }
 855
 856 /*
 857  * Determine the time to sleep, based on the database list.
 858  *
 859  * The "canlaunch" parameter indicates whether we can start a worker right now,
 860  * for example due to the workers being all busy.  If this is false, we will
 861  * cause a long sleep, which will be interrupted when a worker exits.
 862  */
 863 static void
 864 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap)
 865 {
 866         /*
 867          * We sleep until the next scheduled vacuum.  We trust that when the
 868          * database list was built, care was taken so that no entries have times
 869          * in the past; if the first entry has too close a next_worker value, or a
 870          * time in the past, we will sleep a small nominal time.
 871          */
 872         if (!canlaunch)
 873         {
 874                 nap->tv_sec = autovacuum_naptime;
 875                 nap->tv_usec = 0;
 876         }
 877         else if (!dlist_is_empty(&DatabaseList))
 878         {
 879                 TimestampTz current_time = GetCurrentTimestamp();
 880                 TimestampTz next_wakeup;
 881                 avl_dbase  *avdb;
 882                 long            secs;
 883                 int                     usecs;
 884
 885                 avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
 886
 887                 next_wakeup = avdb->adl_next_worker;
 888                 TimestampDifference(current_time, next_wakeup, &secs, &usecs);
 889
 890                 nap->tv_sec = secs;
 891                 nap->tv_usec = usecs;
 892         }
 893         else
 894         {
 895                 /* list is empty, sleep for whole autovacuum_naptime seconds  */
 896                 nap->tv_sec = autovacuum_naptime;
 897                 nap->tv_usec = 0;
 898         }
 899
 900         /*
 901          * If the result is exactly zero, it means a database had an entry with
 902          * time in the past.  Rebuild the list so that the databases are evenly
 903          * distributed again, and recalculate the time to sleep.  This can happen
 904          * if there are more tables needing vacuum than workers, and they all take
 905          * longer to vacuum than autovacuum_naptime.
 906          *
 907          * We only recurse once.  rebuild_database_list should always return times
 908          * in the future, but it seems best not to trust too much on that.
 909          */
 910         if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
 911         {
 912                 rebuild_database_list(InvalidOid);
 913                 launcher_determine_sleep(canlaunch, true, nap);
 914                 return;
 915         }
 916
 917         /* The smallest time we'll allow the launcher to sleep. */
 918         if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
 919         {
 920                 nap->tv_sec = 0;
 921                 nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
 922         }
 923
 924         /*
 925          * If the sleep time is too large, clamp it to an arbitrary maximum (plus
 926          * any fractional seconds, for simplicity).  This avoids an essentially
 927          * infinite sleep in strange cases like the system clock going backwards a
 928          * few years.
 929          */
 930         if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME)
 931                 nap->tv_sec = MAX_AUTOVAC_SLEEPTIME;
 932 }
 933
 934 /*
 935  * Build an updated DatabaseList.  It must only contain databases that appear
 936  * in pgstats, and must be sorted by next_worker from highest to lowest,
 937  * distributed regularly across the next autovacuum_naptime interval.
 938  *
 939  * Receives the Oid of the database that made this list be generated (we call
 940  * this the "new" database, because when the database was already present on
 941  * the list, we expect that this function is not called at all).  The
 942  * preexisting list, if any, will be used to preserve the order of the
 943  * databases in the autovacuum_naptime period.  The new database is put at the
 944  * end of the interval.  The actual values are not saved, which should not be
 945  * much of a problem.
 946  */
 947 static void
 948 rebuild_database_list(Oid newdb)
 949 {
 950         List       *dblist;
 951         ListCell   *cell;
 952         MemoryContext newcxt;
 953         MemoryContext oldcxt;
 954         MemoryContext tmpcxt;
 955         HASHCTL         hctl;
 956         int                     score;
 957         int                     nelems;
 958         HTAB       *dbhash;
 959         dlist_iter      iter;
 960
 961         newcxt = AllocSetContextCreate(AutovacMemCxt,
 962                                                                    "Autovacuum database list",
 963                                                                    ALLOCSET_DEFAULT_SIZES);
 964         tmpcxt = AllocSetContextCreate(newcxt,
 965                                                                    "Autovacuum database list (tmp)",
 966                                                                    ALLOCSET_DEFAULT_SIZES);
 967         oldcxt = MemoryContextSwitchTo(tmpcxt);
 968
 969         /*
 970          * Implementing this is not as simple as it sounds, because we need to put
 971          * the new database at the end of the list; next the databases that were
 972          * already on the list, and finally (at the tail of the list) all the
 973          * other databases that are not on the existing list.
 974          *
 975          * To do this, we build an empty hash table of scored databases.  We will
 976          * start with the lowest score (zero) for the new database, then
 977          * increasing scores for the databases in the existing list, in order, and
 978          * lastly increasing scores for all databases gotten via
 979          * get_database_list() that are not already on the hash.
 980          *
 981          * Then we will put all the hash elements into an array, sort the array by
 982          * score, and finally put the array elements into the new doubly linked
 983          * list.
 984          */
 985         hctl.keysize = sizeof(Oid);
 986         hctl.entrysize = sizeof(avl_dbase);
 987         hctl.hcxt = tmpcxt;
 988         dbhash = hash_create("autovacuum db hash", 20, &hctl,   /* magic number here
 989                                                                                                                          * FIXME */
 990                                                  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 991
 992         /* start by inserting the new database */
 993         score = 0;
 994         if (OidIsValid(newdb))
 995         {
 996                 avl_dbase  *db;
 997                 PgStat_StatDBEntry *entry;
 998
 999                 /* only consider this database if it has a pgstat entry */
1000                 entry = pgstat_fetch_stat_dbentry(newdb);
1001                 if (entry != NULL)
1002                 {
1003                         /* we assume it isn't found because the hash was just created */
1004                         db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
1005
1006                         /* hash_search already filled in the key */
1007                         db->adl_score = score++;
1008                         /* next_worker is filled in later */
1009                 }
1010         }
1011
1012         /* Now insert the databases from the existing list */
1013         dlist_foreach(iter, &DatabaseList)
1014         {
1015                 avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1016                 avl_dbase  *db;
1017                 bool            found;
1018                 PgStat_StatDBEntry *entry;
1019
1020                 /*
1021                  * skip databases with no stat entries -- in particular, this gets rid
1022                  * of dropped databases
1023                  */
1024                 entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
1025                 if (entry == NULL)
1026                         continue;
1027
1028                 db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
1029
1030                 if (!found)
1031                 {
1032                         /* hash_search already filled in the key */
1033                         db->adl_score = score++;
1034                         /* next_worker is filled in later */
1035                 }
1036         }
1037
1038         /* finally, insert all qualifying databases not previously inserted */
1039         dblist = get_database_list();
1040         foreach(cell, dblist)
1041         {
1042                 avw_dbase  *avdb = lfirst(cell);
1043                 avl_dbase  *db;
1044                 bool            found;
1045                 PgStat_StatDBEntry *entry;
1046
1047                 /* only consider databases with a pgstat entry */
1048                 entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
1049                 if (entry == NULL)
1050                         continue;
1051
1052                 db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
1053                 /* only update the score if the database was not already on the hash */
1054                 if (!found)
1055                 {
1056                         /* hash_search already filled in the key */
1057                         db->adl_score = score++;
1058                         /* next_worker is filled in later */
1059                 }
1060         }
1061         nelems = score;
1062
1063         /* from here on, the allocated memory belongs to the new list */
1064         MemoryContextSwitchTo(newcxt);
1065         dlist_init(&DatabaseList);
1066
1067         if (nelems > 0)
1068         {
1069                 TimestampTz current_time;
1070                 int                     millis_increment;
1071                 avl_dbase  *dbary;
1072                 avl_dbase  *db;
1073                 HASH_SEQ_STATUS seq;
1074                 int                     i;
1075
1076                 /* put all the hash elements into an array */
1077                 dbary = palloc(nelems * sizeof(avl_dbase));
1078
1079                 i = 0;
1080                 hash_seq_init(&seq, dbhash);
1081                 while ((db = hash_seq_search(&seq)) != NULL)
1082                         memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1083
1084                 /* sort the array */
1085                 qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1086
1087                 /*
1088                  * Determine the time interval between databases in the schedule. If
1089                  * we see that the configured naptime would take us to sleep times
1090                  * lower than our min sleep time (which launcher_determine_sleep is
1091                  * coded not to allow), silently use a larger naptime (but don't touch
1092                  * the GUC variable).
1093                  */
1094                 millis_increment = 1000.0 * autovacuum_naptime / nelems;
1095                 if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1096                         millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1097
1098                 current_time = GetCurrentTimestamp();
1099
1100                 /*
1101                  * move the elements from the array into the dlist, setting the
1102                  * next_worker while walking the array
1103                  */
1104                 for (i = 0; i < nelems; i++)
1105                 {
1106                         db = &(dbary[i]);
1107
1108                         current_time = TimestampTzPlusMilliseconds(current_time,
1109                                                                                                            millis_increment);
1110                         db->adl_next_worker = current_time;
1111
1112                         /* later elements should go closer to the head of the list */
1113                         dlist_push_head(&DatabaseList, &db->adl_node);
1114                 }
1115         }
1116
1117         /* all done, clean up memory */
1118         if (DatabaseListCxt != NULL)
1119                 MemoryContextDelete(DatabaseListCxt);
1120         MemoryContextDelete(tmpcxt);
1121         DatabaseListCxt = newcxt;
1122         MemoryContextSwitchTo(oldcxt);
1123 }
1124
1125 /* qsort comparator for avl_dbase, using adl_score */
1126 static int
1127 db_comparator(const void *a, const void *b)
1128 {
1129         if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1130                 return 0;
1131         else
1132                 return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1133 }
1134
1135 /*
1136  * do_start_worker
1137  *
1138  * Bare-bones procedure for starting an autovacuum worker from the launcher.
1139  * It determines what database to work on, sets up shared memory stuff and
1140  * signals postmaster to start the worker.  It fails gracefully if invoked when
1141  * autovacuum_workers are already active.
1142  *
1143  * Return value is the OID of the database that the worker is going to process,
1144  * or InvalidOid if no worker was actually started.
1145  */
1146 static Oid
1147 do_start_worker(void)
1148 {
1149         List       *dblist;
1150         ListCell   *cell;
1151         TransactionId xidForceLimit;
1152         MultiXactId multiForceLimit;
1153         bool            for_xid_wrap;
1154         bool            for_multi_wrap;
1155         avw_dbase  *avdb;
1156         TimestampTz current_time;
1157         bool            skipit = false;
1158         Oid                     retval = InvalidOid;
1159         MemoryContext tmpcxt,
1160                                 oldcxt;
1161
1162         /* return quickly when there are no free workers */
1163         LWLockAcquire(AutovacuumLock, LW_SHARED);
1164         if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1165         {
1166                 LWLockRelease(AutovacuumLock);
1167                 return InvalidOid;
1168         }
1169         LWLockRelease(AutovacuumLock);
1170
1171         /*
1172          * Create and switch to a temporary context to avoid leaking the memory
1173          * allocated for the database list.
1174          */
1175         tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1176                                                                    "Autovacuum start worker (tmp)",
1177                                                                    ALLOCSET_DEFAULT_SIZES);
1178         oldcxt = MemoryContextSwitchTo(tmpcxt);
1179
1180         /* Get a list of databases */
1181         dblist = get_database_list();
1182
1183         /*
1184          * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1185          * pass without forcing a vacuum.  (This limit can be tightened for
1186          * particular tables, but not loosened.)
1187          */
1188         recentXid = ReadNextTransactionId();
1189         xidForceLimit = recentXid - autovacuum_freeze_max_age;
1190         /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1191         /* this can cause the limit to go backwards by 3, but that's OK */
1192         if (xidForceLimit < FirstNormalTransactionId)
1193                 xidForceLimit -= FirstNormalTransactionId;
1194
1195         /* Also determine the oldest datminmxid we will consider. */
1196         recentMulti = ReadNextMultiXactId();
1197         multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
1198         if (multiForceLimit < FirstMultiXactId)
1199                 multiForceLimit -= FirstMultiXactId;
1200
1201         /*
1202          * Choose a database to connect to.  We pick the database that was least
1203          * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1204          * wraparound-related data loss.  If any db at risk of Xid wraparound is
1205          * found, we pick the one with oldest datfrozenxid, independently of
1206          * autovacuum times; similarly we pick the one with the oldest datminmxid
1207          * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1208          * danger are given more priority than those in multi wraparound danger.
1209          *
1210          * Note that a database with no stats entry is not considered, except for
1211          * Xid wraparound purposes.  The theory is that if no one has ever
1212          * connected to it since the stats were last initialized, it doesn't need
1213          * vacuuming.
1214          *
1215          * XXX This could be improved if we had more info about whether it needs
1216          * vacuuming before connecting to it.  Perhaps look through the pgstats
1217          * data for the database's tables?  One idea is to keep track of the
1218          * number of new and dead tuples per database in pgstats.  However it
1219          * isn't clear how to construct a metric that measures that and not cause
1220          * starvation for less busy databases.
1221          */
1222         avdb = NULL;
1223         for_xid_wrap = false;
1224         for_multi_wrap = false;
1225         current_time = GetCurrentTimestamp();
1226         foreach(cell, dblist)
1227         {
1228                 avw_dbase  *tmp = lfirst(cell);
1229                 dlist_iter      iter;
1230
1231                 /* Check to see if this one is at risk of wraparound */
1232                 if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1233                 {
1234                         if (avdb == NULL ||
1235                                 TransactionIdPrecedes(tmp->adw_frozenxid,
1236                                                                           avdb->adw_frozenxid))
1237                                 avdb = tmp;
1238                         for_xid_wrap = true;
1239                         continue;
1240                 }
1241                 else if (for_xid_wrap)
1242                         continue;                       /* ignore not-at-risk DBs */
1243                 else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1244                 {
1245                         if (avdb == NULL ||
1246                                 MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1247                                 avdb = tmp;
1248                         for_multi_wrap = true;
1249                         continue;
1250                 }
1251                 else if (for_multi_wrap)
1252                         continue;                       /* ignore not-at-risk DBs */
1253
1254                 /* Find pgstat entry if any */
1255                 tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1256
1257                 /*
1258                  * Skip a database with no pgstat entry; it means it hasn't seen any
1259                  * activity.
1260                  */
1261                 if (!tmp->adw_entry)
1262                         continue;
1263
1264                 /*
1265                  * Also, skip a database that appears on the database list as having
1266                  * been processed recently (less than autovacuum_naptime seconds ago).
1267                  * We do this so that we don't select a database which we just
1268                  * selected, but that pgstat hasn't gotten around to updating the last
1269                  * autovacuum time yet.
1270                  */
1271                 skipit = false;
1272
1273                 dlist_reverse_foreach(iter, &DatabaseList)
1274                 {
1275                         avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1276
1277                         if (dbp->adl_datid == tmp->adw_datid)
1278                         {
1279                                 /*
1280                                  * Skip this database if its next_worker value falls between
1281                                  * the current time and the current time plus naptime.
1282                                  */
1283                                 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1284                                                                                                 current_time, 0) &&
1285                                         !TimestampDifferenceExceeds(current_time,
1286                                                                                                 dbp->adl_next_worker,
1287                                                                                                 autovacuum_naptime * 1000))
1288                                         skipit = true;
1289
1290                                 break;
1291                         }
1292                 }
1293                 if (skipit)
1294                         continue;
1295
1296                 /*
1297                  * Remember the db with oldest autovac time.  (If we are here, both
1298                  * tmp->entry and db->entry must be non-null.)
1299                  */
1300                 if (avdb == NULL ||
1301                         tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1302                         avdb = tmp;
1303         }
1304
1305         /* Found a database -- process it */
1306         if (avdb != NULL)
1307         {
1308                 WorkerInfo      worker;
1309                 dlist_node *wptr;
1310
1311                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1312
1313                 /*
1314                  * Get a worker entry from the freelist.  We checked above, so there
1315                  * really should be a free slot.
1316                  */
1317                 wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1318
1319                 worker = dlist_container(WorkerInfoData, wi_links, wptr);
1320                 worker->wi_dboid = avdb->adw_datid;
1321                 worker->wi_proc = NULL;
1322                 worker->wi_launchtime = GetCurrentTimestamp();
1323
1324                 AutoVacuumShmem->av_startingWorker = worker;
1325
1326                 LWLockRelease(AutovacuumLock);
1327
1328                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1329
1330                 retval = avdb->adw_datid;
1331         }
1332         else if (skipit)
1333         {
1334                 /*
1335                  * If we skipped all databases on the list, rebuild it, because it
1336                  * probably contains a dropped database.
1337                  */
1338                 rebuild_database_list(InvalidOid);
1339         }
1340
1341         MemoryContextSwitchTo(oldcxt);
1342         MemoryContextDelete(tmpcxt);
1343
1344         return retval;
1345 }
1346
1347 /*
1348  * launch_worker
1349  *
1350  * Wrapper for starting a worker from the launcher.  The database itself is
1351  * chosen by do_start_worker(); if that yields a valid OID, the matching
1352  * DatabaseList entry gets adl_next_worker set to "now" (the caller-supplied
1353  * current time) plus autovacuum_naptime seconds and is moved to the list head.
1354  *
1355  * This routine is also expected to insert an entry into the database list if
1356  * the selected database was previously absent; it does so by rebuilding it.
1357  */
1358 static void
1359 launch_worker(TimestampTz now)
1360 {
1361         Oid                     dbid;
1362         dlist_iter      iter;
1363
1364         dbid = do_start_worker();
1365         if (OidIsValid(dbid))
1366         {
1367                 bool            found = false;
1368
1369                 /*
1370                  * Walk the database list and update the corresponding entry.  If the
1371                  * database is not on the list, we'll recreate the list.
1372                  */
1373                 dlist_foreach(iter, &DatabaseList)
1374                 {
1375                         avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1376
1377                         if (avdb->adl_datid == dbid)
1378                         {
1379                                 found = true;
1380
1381                                 /*
1382                                  * Add autovacuum_naptime seconds (scaled by 1000 to get
1383                                  * milliseconds) to "now"; that is this database's new next_worker.
1384                                  */
1385                                 avdb->adl_next_worker =
1386                                         TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1387
1388                                 dlist_move_head(&DatabaseList, iter.cur);
1389                                 break;          /* stop at the first matching entry */
1390                         }
1391                 }
1392
1393                 /*
1394                  * If the database was not present in the database list, we rebuild
1395                  * the list.  It's possible that the database does not get into the
1396                  * list anyway, for example if it's a database that doesn't have a
1397                  * pgstat entry, but this is not a problem because we don't want to
1398                  * schedule workers regularly into those in any case.
1399                  */
1400                 if (!found)
1401                         rebuild_database_list(dbid);
1402         }
1403 }
1404
1405 /*
1406  * AutoVacWorkerFailed -- called from postmaster to signal a failure to fork
1407  * a process to become worker.  It only sets the AutoVacForkFailed flag in
1408  * av_signal[]; the postmaster should kill(SIGUSR2) the launcher shortly after.
1409  */
1410 void
1411 AutoVacWorkerFailed(void)
1412 {
1413         AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;   /* no lock is taken here */
1414 }
1415
1416 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1417 static void
1418 avl_sigusr2_handler(SIGNAL_ARGS)
1419 {
1420         int                     save_errno = errno;     /* put back before returning */
1421
1422         got_SIGUSR2 = true;             /* the handler only records the signal ... */
1423         SetLatch(MyLatch);              /* ... and sets this process's latch */
1424
1425         errno = save_errno;
1426 }
1427
1428
1429 /********************************************************************
1430  *                                        AUTOVACUUM WORKER CODE
1431  ********************************************************************/
1432
1433 #ifdef EXEC_BACKEND
1434 /*
1435  * forkexec routines for the autovacuum worker.
1436  *
1437  * Format up the arglist, then return whatever postmaster_forkexec() returns.
1438  */
1439 static pid_t
1440 avworker_forkexec(void)
1441 {
1442         char       *av[10];
1443         int                     ac = 0;
1444
1445         av[ac++] = "postgres";
1446         av[ac++] = "--forkavworker";
1447         av[ac++] = NULL;                        /* filled in by postmaster_forkexec */
1448         av[ac] = NULL;                          /* terminator, not counted in ac */
1449
1450         Assert(ac < lengthof(av));      /* terminator slot must lie inside av[] */
1451
1452         return postmaster_forkexec(ac, av);
1453 }
1454
1455 /*
1456  * We need this set from the outside, before InitProcess is called
1457  */
1458 void
1459 AutovacuumWorkerIAm(void)
1460 {
1461         am_autovacuum_worker = true;    /* same flag AutoVacWorkerMain sets on entry */
1462 }
1463 #endif
1464
1465 /*
1466  * Main entry point for autovacuum worker process: fork the worker (fork+exec
1467  * under EXEC_BACKEND).  Returns the child's PID to the caller, or 0 after
1468  * logging if the fork failed.  This code is heavily based on pgarch.c, q.v.
1469  */
1470 int
1471 StartAutoVacWorker(void)
1472 {
1473         pid_t           worker_pid;
1474
1475 #ifdef EXEC_BACKEND
1476         switch ((worker_pid = avworker_forkexec()))
1477 #else
1478         switch ((worker_pid = fork_process()))
1479 #endif
1480         {
1481                 case -1:
1482                         ereport(LOG,
1483                                         (errmsg("could not fork autovacuum worker process: %m")));
1484                         return 0;               /* 0 = no worker was started */
1485
1486 #ifndef EXEC_BACKEND
1487                 case 0:
1488                         /* in postmaster child ... */
1489                         InitPostmasterChild();
1490
1491                         /* Close the postmaster's sockets */
1492                         ClosePostmasterPorts(false);
1493
1494                         AutoVacWorkerMain(0, NULL);     /* ends in proc_exit(0) */
1495                         break;
1496 #endif
1497                 default:
1498                         return (int) worker_pid;        /* parent: hand back the child's PID */
1499         }
1500
1501         /* shouldn't get here */
1502         return 0;
1503 }
1504
1505 /*
1506  * AutoVacWorkerMain -- body of a worker process; argc/argv are not used and every path ends in proc_exit(0)
1507  */
1508 NON_EXEC_STATIC void
1509 AutoVacWorkerMain(int argc, char *argv[])
1510 {
1511         sigjmp_buf      local_sigjmp_buf;
1512         Oid                     dbid;
1513
1514         am_autovacuum_worker = true;
1515
1516         MyBackendType = B_AUTOVAC_WORKER;
1517         init_ps_display(NULL);
1518
1519         SetProcessingMode(InitProcessing);
1520
1521         /*
1522          * Set up signal handlers.  We operate on databases much like a regular
1523          * backend, so we use the same signal handling.  See equivalent code in
1524          * tcop/postgres.c.
1525          */
1526         pqsignal(SIGHUP, SignalHandlerForConfigReload);
1527
1528         /*
1529          * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1530          * means abort and exit cleanly, and SIGQUIT means abandon ship.
1531          */
1532         pqsignal(SIGINT, StatementCancelHandler);
1533         pqsignal(SIGTERM, die);
1534         /* SIGQUIT handler was already set up by InitPostmasterChild */
1535
1536         InitializeTimeouts();           /* establishes SIGALRM handler */
1537
1538         pqsignal(SIGPIPE, SIG_IGN);
1539         pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1540         pqsignal(SIGUSR2, SIG_IGN);
1541         pqsignal(SIGFPE, FloatExceptionHandler);
1542         pqsignal(SIGCHLD, SIG_DFL);
1543
1544         /*
1545          * Create a per-backend PGPROC struct in shared memory, except in the
1546          * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1547          * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1548          * had to do some stuff with LWLocks).
1549          */
1550 #ifndef EXEC_BACKEND
1551         InitProcess();
1552 #endif
1553
1554         /* Early initialization */
1555         BaseInit();
1556
1557         /*
1558          * If an exception is encountered, processing resumes here.
1559          *
1560          * Unlike most auxiliary processes, we don't attempt to continue
1561          * processing after an error; we just clean up and exit.  The autovac
1562          * launcher is responsible for spawning another worker later.
1563          *
1564          * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
1565          * (to wit, BlockSig) will be restored when longjmp'ing to here.  Thus,
1566          * signals other than SIGQUIT will be blocked until we exit.  It might
1567          * seem that this policy makes the HOLD_INTERRUPTS() call redundant, but
1568          * it is not since InterruptPending might be set already.
1569          */
1570         if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1571         {
1572                 /* since not using PG_TRY, must reset error stack by hand */
1573                 error_context_stack = NULL;
1574
1575                 /* Prevents interrupts while cleaning up */
1576                 HOLD_INTERRUPTS();
1577
1578                 /* Report the error to the server log */
1579                 EmitErrorReport();
1580
1581                 /*
1582                  * We can now go away.  Note that because we called InitProcess, a
1583                  * callback was registered to do ProcKill, which will clean up
1584                  * necessary state.
1585                  */
1586                 proc_exit(0);           /* exit code is 0 on the error path too */
1587         }
1588
1589         /* We can now handle ereport(ERROR) */
1590         PG_exception_stack = &local_sigjmp_buf;
1591
1592         sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
1593
1594         /*
1595          * Set always-secure search path, so malicious users can't redirect user
1596          * code (e.g. pg_index.indexprs).  (That code runs in a
1597          * SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not
1598          * take control of the entire autovacuum worker in any case.)
1599          */
1600         SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1601
1602         /*
1603          * Force zero_damaged_pages OFF in the autovac process, even if it is set
1604          * in postgresql.conf.  We don't really want such a dangerous option being
1605          * applied non-interactively.
1606          */
1607         SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1608
1609         /*
1610          * Force settable timeouts off to avoid letting these settings prevent
1611          * regular maintenance from being executed.
1612          */
1613         SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1614         SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1615         SetConfigOption("idle_in_transaction_session_timeout", "0",
1616                                         PGC_SUSET, PGC_S_OVERRIDE);
1617
1618         /*
1619          * Force default_transaction_isolation to READ COMMITTED.  We don't want
1620          * to pay the overhead of serializable mode, nor add any risk of causing
1621          * deadlocks or delaying other transactions.
1622          */
1623         SetConfigOption("default_transaction_isolation", "read committed",
1624                                         PGC_SUSET, PGC_S_OVERRIDE);
1625
1626         /*
1627          * Force synchronous replication off to allow regular maintenance even if
1628          * we are waiting for standbys to connect. This is important to ensure we
1629          * aren't blocked from performing anti-wraparound tasks.
1630          */
1631         if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1632                 SetConfigOption("synchronous_commit", "local",
1633                                                 PGC_SUSET, PGC_S_OVERRIDE);
1634
1635         /*
1636          * Even when system is configured to use a different fetch consistency,
1637          * for autovac we always want fresh stats.
1638          */
1639         SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE);
1640
1641         /*
1642          * Get the info about the database we're going to work on.
1643          */
1644         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1645
1646         /*
1647          * Beware of av_startingWorker being NULL; this should normally not
1648          * happen, but if a worker fails after forking and before this, the
1649          * launcher might have decided to remove it from the queue and start
1650          * again.
1651          */
1652         if (AutoVacuumShmem->av_startingWorker != NULL)
1653         {
1654                 MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1655                 dbid = MyWorkerInfo->wi_dboid;
1656                 MyWorkerInfo->wi_proc = MyProc;
1657
1658                 /* insert into the running list */
1659                 dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1660                                                 &MyWorkerInfo->wi_links);
1661
1662                 /*
1663                  * remove from the "starting" pointer, so that the launcher can start
1664                  * a new worker if required
1665                  */
1666                 AutoVacuumShmem->av_startingWorker = NULL;
1667                 LWLockRelease(AutovacuumLock);
1668
1669                 on_shmem_exit(FreeWorkerInfo, 0);       /* gives the slot back at exit */
1670
1671                 /* wake up the launcher (av_launcherpid is read after the lock release) */
1672                 if (AutoVacuumShmem->av_launcherpid != 0)
1673                         kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1674         }
1675         else
1676         {
1677                 /* no worker entry for me, go away */
1678                 elog(WARNING, "autovacuum worker started without a worker entry");
1679                 dbid = InvalidOid;
1680                 LWLockRelease(AutovacuumLock);
1681         }
1682
1683         if (OidIsValid(dbid))
1684         {
1685                 char            dbname[NAMEDATALEN];    /* output buffer handed to InitPostgres */
1686
1687                 /*
1688                  * Report autovac startup to the cumulative stats system.  We
1689                  * deliberately do this before InitPostgres, so that the
1690                  * last_autovac_time will get updated even if the connection attempt
1691                  * fails.  This is to prevent autovac from getting "stuck" repeatedly
1692                  * selecting an unopenable database, rather than making any progress
1693                  * on stuff it can connect to.
1694                  */
1695                 pgstat_report_autovac(dbid);
1696
1697                 /*
1698                  * Connect to the selected database, specifying no particular user
1699                  *
1700                  * Note: if we have selected a just-deleted database (due to using
1701                  * stale stats info), we'll fail and exit here.
1702                  */
1703                 InitPostgres(NULL, dbid, NULL, InvalidOid, false, false,
1704                                          dbname);
1705                 SetProcessingMode(NormalProcessing);
1706                 set_ps_display(dbname);
1707                 ereport(DEBUG1,
1708                                 (errmsg_internal("autovacuum: processing database \"%s\"", dbname)));
1709
1710                 if (PostAuthDelay)
1711                         pg_usleep(PostAuthDelay * 1000000L);    /* presumably seconds -> microseconds; confirm */
1712
1713                 /* And do an appropriate amount of work */
1714                 recentXid = ReadNextTransactionId();
1715                 recentMulti = ReadNextMultiXactId();
1716                 do_autovacuum();
1717         }
1718
1719         /*
1720          * The launcher will be notified of my death in ProcKill, *if* we managed
1721          * to get a worker slot at all
1722          */
1723
1724         /* All done, go away */
1725         proc_exit(0);
1726 }
1727
1728 /*
1729  * Return MyWorkerInfo to the free list; on_shmem_exit callback, args unused
1730  */
1731 static void
1732 FreeWorkerInfo(int code, Datum arg)
1733 {
1734         if (MyWorkerInfo != NULL)
1735         {
1736                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1737
1738                 /*
1739                  * Wake the launcher up so that he can launch a new worker immediately
1740                  * if required.  We only save the launcher's PID in local memory here;
1741                  * the actual signal will be sent when the PGPROC is recycled.  Note
1742                  * that we always do this, so that the launcher can rebalance the cost
1743                  * limit setting of the remaining workers.
1744                  *
1745                  * We somewhat ignore the risk that the launcher changes its PID
1746                  * between us reading it and the actual kill; we expect ProcKill to be
1747                  * called shortly after us, and we assume that PIDs are not reused too
1748                  * quickly after a process exits.
1749                  */
1750                 AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1751
1752                 dlist_delete(&MyWorkerInfo->wi_links);  /* unlink from the list it is on now */
1753                 MyWorkerInfo->wi_dboid = InvalidOid;    /* clear all fields of the slot */
1754                 MyWorkerInfo->wi_tableoid = InvalidOid;
1755                 MyWorkerInfo->wi_sharedrel = false;
1756                 MyWorkerInfo->wi_proc = NULL;
1757                 MyWorkerInfo->wi_launchtime = 0;
1758                 MyWorkerInfo->wi_dobalance = false;
1759                 MyWorkerInfo->wi_cost_delay = 0;
1760                 MyWorkerInfo->wi_cost_limit = 0;
1761                 MyWorkerInfo->wi_cost_limit_base = 0;
1762                 dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1763                                                 &MyWorkerInfo->wi_links);
1764                 /* not mine anymore; also makes a second call a no-op */
1765                 MyWorkerInfo = NULL;
1766
1767                 /*
1768                  * now that we're inactive, cause a rebalancing of the surviving
1769                  * workers
1770                  */
1771                 AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1772                 LWLockRelease(AutovacuumLock);
1773         }
1774 }
1775
1776 /*
1777  * Copy this worker's wi_cost_delay and wi_cost_limit into VacuumCostDelay and
1778  * VacuumCostLimit, so that each worker consumes a fraction of the available I/O.
1779  */
1780 void
1781 AutoVacuumUpdateDelay(void)
1782 {
1783         if (MyWorkerInfo)               /* nothing to do without a worker slot */
1784         {
1785                 VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1786                 VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1787         }
1788 }
1789
1790 /*
1791  * autovac_balance_cost
1792  *              Recalculate the cost limit setting for each active worker.
1793  *
1794  * Caller must hold the AutovacuumLock in exclusive mode.
1795  */
1796 static void
1797 autovac_balance_cost(void)
1798 {
1799         /*
1800          * The idea here is that we ration out I/O equally.  The amount of I/O
1801          * that a worker can consume is determined by cost_limit/cost_delay, so we
1802          * try to equalize those ratios rather than the raw limit settings.
1803          *
1804          * note: in cost_limit, zero also means use value from elsewhere, because
1805          * zero is not a valid value.
1806          */
1807         int                     vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1808                                                                   autovacuum_vac_cost_limit : VacuumCostLimit);
1809         double          vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1810                                                                   autovacuum_vac_cost_delay : VacuumCostDelay);
1811         double          cost_total;
1812         double          cost_avail;
1813         dlist_iter      iter;
1814
1815         /* not set? nothing to do */
1816         if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1817                 return;
1818
1819         /* sum wi_cost_limit_base / wi_cost_delay over the participating workers */
1820         cost_total = 0.0;
1821         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1822         {
1823                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1824
1825                 if (worker->wi_proc != NULL &&
1826                         worker->wi_dobalance &&
1827                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1828                         cost_total +=
1829                                 (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1830         }
1831
1832         /* there are no cost limits -- nothing to do */
1833         if (cost_total <= 0)
1834                 return;
1835
1836         /*
1837          * Give each participating worker (same test as in the loop above) the
1838          * limit cost_avail * wi_cost_limit_base / cost_total, clamped below.
1839          */
1840         cost_avail = (double) vac_cost_limit / vac_cost_delay;
1841         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1842         {
1843                 WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1844
1845                 if (worker->wi_proc != NULL &&
1846                         worker->wi_dobalance &&
1847                         worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1848                 {
1849                         int                     limit = (int)
1850                         (cost_avail * worker->wi_cost_limit_base / cost_total);
1851
1852                         /*
1853                          * We put a lower bound of 1 on the cost_limit, to avoid division-
1854                          * by-zero in the vacuum code.  Also, in case of roundoff trouble
1855                          * in these calculations, let's be sure we don't ever set
1856                          * cost_limit to more than the base value.
1857                          */
1858                         worker->wi_cost_limit = Max(Min(limit,
1859                                                                                         worker->wi_cost_limit_base),
1860                                                                                 1);
1861                 }
1862
1863                 /* logged for every worker with a process, balanced or not */
1864                 if (worker->wi_proc != NULL)
1865                         elog(DEBUG2, "autovac_balance_cost(pid=%d db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%g)",
1866                                  worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1867                                  worker->wi_dobalance ? "yes" : "no",
1868                                  worker->wi_cost_limit, worker->wi_cost_limit_base,
1869                                  worker->wi_cost_delay);
1870         }
1871 }
1871
1872 /*
1873  * get_database_list
1874  *              Return a list of all databases found in pg_database.
1875  *
1876  * The list and its avw_dbase elements (adw_entry left NULL) are allocated in the
1877  * caller's memory context, which is in charge of cleaning them up afterwards.
1878  *
1879  * Note: this is the only function in which the autovacuum launcher uses a
1880  * transaction.  Although we aren't attached to any particular database and
1881  * therefore can't access most catalogs, we do have enough infrastructure
1882  * to do a seqscan on pg_database.
1883  */
1884 static List *
1885 get_database_list(void)
1886 {
1887         List       *dblist = NIL;
1888         Relation        rel;
1889         TableScanDesc scan;
1890         HeapTuple       tup;
1891         MemoryContext resultcxt;
1892
1893         /* This is the context that we will allocate our output data in */
1894         resultcxt = CurrentMemoryContext;
1895
1896         /*
1897          * Start a transaction so we can access pg_database, and get a snapshot.
1898          * We don't have a use for the snapshot itself, but we're interested in
1899          * the secondary effect that it sets RecentGlobalXmin.  (This is critical
1900          * for anything that reads heap pages, because HOT may decide to prune
1901          * them even if the process doesn't attempt to modify any tuples.)
1902          *
1903          * FIXME: This comment is inaccurate / the code buggy. A snapshot that is
1904          * not pushed/active does not reliably prevent HOT pruning (->xmin could
1905          * e.g. be cleared when cache invalidations are processed).
1906          */
1907         StartTransactionCommand();
1908         (void) GetTransactionSnapshot();
1909
1910         rel = table_open(DatabaseRelationId, AccessShareLock);
1911         scan = table_beginscan_catalog(rel, 0, NULL);
1912
1913         while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1914         {
1915                 Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1916                 avw_dbase  *avdb;
1917                 MemoryContext oldcxt;
1918
1919                 /*
1920                  * Allocate our results in the caller's context, not the
1921                  * transaction's. We do this inside the loop, and restore the original
1922                  * context at the end, so that leaky things like heap_getnext() are
1923                  * not called in a potentially long-lived context.
1924                  */
1925                 oldcxt = MemoryContextSwitchTo(resultcxt);
1926
1927                 avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1928
1929                 avdb->adw_datid = pgdatabase->oid;
1930                 avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));        /* own copy of the name */
1931                 avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1932                 avdb->adw_minmulti = pgdatabase->datminmxid;
1933                 /* this gets set later: */
1934                 avdb->adw_entry = NULL;
1935
1936                 dblist = lappend(dblist, avdb);
1937                 MemoryContextSwitchTo(oldcxt);
1938         }
1939
1940         table_endscan(scan);
1941         table_close(rel, AccessShareLock);
1942
1943         CommitTransactionCommand();
1944
1945         /* Be sure to restore caller's memory context */
1946         MemoryContextSwitchTo(resultcxt);
1947
1948         return dblist;
1949 }
1950
1951 /*
1952  * Process a database table-by-table
1953  *
1954  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1955  * order not to ignore shutdown commands for too long.
1956  */
1957 static void
1958 do_autovacuum(void)
1959 {
1960         Relation        classRel;
1961         HeapTuple       tuple;
1962         TableScanDesc relScan;
1963         Form_pg_database dbForm;
1964         List       *table_oids = NIL;
1965         List       *orphan_oids = NIL;
1966         HASHCTL         ctl;
1967         HTAB       *table_toast_map;
1968         ListCell   *volatile cell;
1969         BufferAccessStrategy bstrategy;
1970         ScanKeyData key;
1971         TupleDesc       pg_class_desc;
1972         int                     effective_multixact_freeze_max_age;
1973         bool            did_vacuum = false;
1974         bool            found_concurrent_worker = false;
1975         int                     i;
1976
1977         /*
1978          * StartTransactionCommand and CommitTransactionCommand will automatically
1979          * switch to other contexts.  We need this one to keep the list of
1980          * relations to vacuum/analyze across transactions.
1981          */
1982         AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1983                                                                                   "Autovacuum worker",
1984                                                                                   ALLOCSET_DEFAULT_SIZES);
1985         MemoryContextSwitchTo(AutovacMemCxt);
1986
1987         /* Start a transaction so our commands have one to play into. */
1988         StartTransactionCommand();
1989
1990         /*
1991          * Compute the multixact age for which freezing is urgent.  This is
1992          * normally autovacuum_multixact_freeze_max_age, but may be less if we are
1993          * short of multixact member space.
1994          */
1995         effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
1996
1997         /*
1998          * Find the pg_database entry and select the default freeze ages. We use
1999          * zero in template and nonconnectable databases, else the system-wide
2000          * default.
2001          */
2002         tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
2003         if (!HeapTupleIsValid(tuple))
2004                 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
2005         dbForm = (Form_pg_database) GETSTRUCT(tuple);
2006
2007         if (dbForm->datistemplate || !dbForm->datallowconn)
2008         {
2009                 default_freeze_min_age = 0;
2010                 default_freeze_table_age = 0;
2011                 default_multixact_freeze_min_age = 0;
2012                 default_multixact_freeze_table_age = 0;
2013         }
2014         else
2015         {
2016                 default_freeze_min_age = vacuum_freeze_min_age;
2017                 default_freeze_table_age = vacuum_freeze_table_age;
2018                 default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
2019                 default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
2020         }
2021
2022         ReleaseSysCache(tuple);
2023
2024         /* StartTransactionCommand changed elsewhere */
2025         MemoryContextSwitchTo(AutovacMemCxt);
2026
2027         classRel = table_open(RelationRelationId, AccessShareLock);
2028
2029         /* create a copy so we can use it after closing pg_class */
2030         pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
2031
2032         /* create hash table for toast <-> main relid mapping */
2033         ctl.keysize = sizeof(Oid);
2034         ctl.entrysize = sizeof(av_relation);
2035
2036         table_toast_map = hash_create("TOAST to main relid map",
2037                                                                   100,
2038                                                                   &ctl,
2039                                                                   HASH_ELEM | HASH_BLOBS);
2040
2041         /*
2042          * Scan pg_class to determine which tables to vacuum.
2043          *
2044          * We do this in two passes: on the first one we collect the list of plain
2045          * relations and materialized views, and on the second one we collect
2046          * TOAST tables. The reason for doing the second pass is that during it we
2047          * want to use the main relation's pg_class.reloptions entry if the TOAST
2048          * table does not have any, and we cannot obtain it unless we know
2049          * beforehand what's the main table OID.
2050          *
2051          * We need to check TOAST tables separately because in cases with short,
2052          * wide tables there might be proportionally much more activity in the
2053          * TOAST table than in its parent.
2054          */
2055         relScan = table_beginscan_catalog(classRel, 0, NULL);
2056
2057         /*
2058          * On the first pass, we collect main tables to vacuum, and also the main
2059          * table relid to TOAST relid mapping.
2060          */
2061         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2062         {
2063                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2064                 PgStat_StatTabEntry *tabentry;
2065                 AutoVacOpts *relopts;
2066                 Oid                     relid;
2067                 bool            dovacuum;
2068                 bool            doanalyze;
2069                 bool            wraparound;
2070
2071                 if (classForm->relkind != RELKIND_RELATION &&
2072                         classForm->relkind != RELKIND_MATVIEW)
2073                         continue;
2074
2075                 relid = classForm->oid;
2076
2077                 /*
2078                  * Check if it is a temp table (presumably, of some other backend's).
2079                  * We cannot safely process other backends' temp tables.
2080                  */
2081                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2082                 {
2083                         /*
2084                          * We just ignore it if the owning backend is still active and
2085                          * using the temporary schema.  Also, for safety, ignore it if the
2086                          * namespace doesn't exist or isn't a temp namespace after all.
2087                          */
2088                         if (checkTempNamespaceStatus(classForm->relnamespace) == TEMP_NAMESPACE_IDLE)
2089                         {
2090                                 /*
2091                                  * The table seems to be orphaned -- although it might be that
2092                                  * the owning backend has already deleted it and exited; our
2093                                  * pg_class scan snapshot is not necessarily up-to-date
2094                                  * anymore, so we could be looking at a committed-dead entry.
2095                                  * Remember it so we can try to delete it later.
2096                                  */
2097                                 orphan_oids = lappend_oid(orphan_oids, relid);
2098                         }
2099                         continue;
2100                 }
2101
2102                 /* Fetch reloptions and the pgstat entry for this table */
2103                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2104                 tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
2105                                                                                                   relid);
2106
2107                 /* Check if it needs vacuum or analyze */
2108                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2109                                                                   effective_multixact_freeze_max_age,
2110                                                                   &dovacuum, &doanalyze, &wraparound);
2111
2112                 /* Relations that need work are added to table_oids */
2113                 if (dovacuum || doanalyze)
2114                         table_oids = lappend_oid(table_oids, relid);
2115
2116                 /*
2117                  * Remember TOAST associations for the second pass.  Note: we must do
2118                  * this whether or not the table is going to be vacuumed, because we
2119                  * don't automatically vacuum toast tables along the parent table.
2120                  */
2121                 if (OidIsValid(classForm->reltoastrelid))
2122                 {
2123                         av_relation *hentry;
2124                         bool            found;
2125
2126                         hentry = hash_search(table_toast_map,
2127                                                                  &classForm->reltoastrelid,
2128                                                                  HASH_ENTER, &found);
2129
2130                         if (!found)
2131                         {
2132                                 /* hash_search already filled in the key */
2133                                 hentry->ar_relid = relid;
2134                                 hentry->ar_hasrelopts = false;
2135                                 if (relopts != NULL)
2136                                 {
2137                                         hentry->ar_hasrelopts = true;
2138                                         memcpy(&hentry->ar_reloptions, relopts,
2139                                                    sizeof(AutoVacOpts));
2140                                 }
2141                         }
2142                 }
2143         }
2144
2145         table_endscan(relScan);
2146
2147         /* second pass: check TOAST tables */
2148         ScanKeyInit(&key,
2149                                 Anum_pg_class_relkind,
2150                                 BTEqualStrategyNumber, F_CHAREQ,
2151                                 CharGetDatum(RELKIND_TOASTVALUE));
2152
2153         relScan = table_beginscan_catalog(classRel, 1, &key);
2154         while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2155         {
2156                 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2157                 PgStat_StatTabEntry *tabentry;
2158                 Oid                     relid;
2159                 AutoVacOpts *relopts = NULL;
2160                 bool            dovacuum;
2161                 bool            doanalyze;
2162                 bool            wraparound;
2163
2164                 /*
2165                  * We cannot safely process other backends' temp tables, so skip 'em.
2166                  */
2167                 if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2168                         continue;
2169
2170                 relid = classForm->oid;
2171
2172                 /*
2173                  * fetch reloptions -- if this toast table does not have them, try the
2174                  * main rel
2175                  */
2176                 relopts = extract_autovac_opts(tuple, pg_class_desc);
2177                 if (relopts == NULL)
2178                 {
2179                         av_relation *hentry;
2180                         bool            found;
2181
2182                         hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2183                         if (found && hentry->ar_hasrelopts)
2184                                 relopts = &hentry->ar_reloptions;
2185                 }
2186
2187                 /* Fetch the pgstat entry for this table */
2188                 tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
2189                                                                                                   relid);
2190
2191                 relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2192                                                                   effective_multixact_freeze_max_age,
2193                                                                   &dovacuum, &doanalyze, &wraparound);
2194
2195                 /* ignore analyze for toast tables */
2196                 if (dovacuum)
2197                         table_oids = lappend_oid(table_oids, relid);
2198         }
2199
2200         table_endscan(relScan);
2201         table_close(classRel, AccessShareLock);
2202
2203         /*
2204          * Recheck orphan temporary tables, and if they still seem orphaned, drop
2205          * them.  We'll eat a transaction per dropped table, which might seem
2206          * excessive, but we should only need to do anything as a result of a
2207          * previous backend crash, so this should not happen often enough to
2208          * justify "optimizing".  Using separate transactions ensures that we
2209          * don't bloat the lock table if there are many temp tables to be dropped,
2210          * and it ensures that we don't lose work if a deletion attempt fails.
2211          */
2212         foreach(cell, orphan_oids)
2213         {
2214                 Oid                     relid = lfirst_oid(cell);
2215                 Form_pg_class classForm;
2216                 ObjectAddress object;
2217
2218                 /*
2219                  * Check for user-requested abort.
2220                  */
2221                 CHECK_FOR_INTERRUPTS();
2222
2223                 /*
2224                  * Try to lock the table.  If we can't get the lock immediately,
2225                  * somebody else is using (or dropping) the table, so it's not our
2226                  * concern anymore.  Having the lock prevents race conditions below.
2227                  */
2228                 if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
2229                         continue;
2230
2231                 /*
2232                  * Re-fetch the pg_class tuple and re-check whether it still seems to
2233                  * be an orphaned temp table.  If it's not there or no longer the same
2234                  * relation, ignore it.
2235                  */
2236                 tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2237                 if (!HeapTupleIsValid(tuple))
2238                 {
2239                         /* be sure to drop useless lock so we don't bloat lock table */
2240                         UnlockRelationOid(relid, AccessExclusiveLock);
2241                         continue;
2242                 }
2243                 classForm = (Form_pg_class) GETSTRUCT(tuple);
2244
2245                 /*
2246                  * Make all the same tests made in the loop above.  In event of OID
2247                  * counter wraparound, the pg_class entry we have now might be
2248                  * completely unrelated to the one we saw before.
2249                  */
2250                 if (!((classForm->relkind == RELKIND_RELATION ||
2251                            classForm->relkind == RELKIND_MATVIEW) &&
2252                           classForm->relpersistence == RELPERSISTENCE_TEMP))
2253                 {
2254                         UnlockRelationOid(relid, AccessExclusiveLock);
2255                         continue;
2256                 }
2257
2258                 if (checkTempNamespaceStatus(classForm->relnamespace) != TEMP_NAMESPACE_IDLE)
2259                 {
2260                         UnlockRelationOid(relid, AccessExclusiveLock);
2261                         continue;
2262                 }
2263
2264                 /* OK, let's delete it */
2265                 ereport(LOG,
2266                                 (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
2267                                                 get_database_name(MyDatabaseId),
2268                                                 get_namespace_name(classForm->relnamespace),
2269                                                 NameStr(classForm->relname))));
2270
2271                 object.classId = RelationRelationId;
2272                 object.objectId = relid;
2273                 object.objectSubId = 0;
2274                 performDeletion(&object, DROP_CASCADE,
2275                                                 PERFORM_DELETION_INTERNAL |
2276                                                 PERFORM_DELETION_QUIETLY |
2277                                                 PERFORM_DELETION_SKIP_EXTENSIONS);
2278
2279                 /*
2280                  * To commit the deletion, end current transaction and start a new
2281                  * one.  Note this also releases the lock we took.
2282                  */
2283                 CommitTransactionCommand();
2284                 StartTransactionCommand();
2285
2286                 /* StartTransactionCommand changed current memory context */
2287                 MemoryContextSwitchTo(AutovacMemCxt);
2288         }
2289
2290         /*
2291          * Create a buffer access strategy object for VACUUM to use.  We want to
2292          * use the same one across all the vacuum operations we perform, since the
2293          * point is for VACUUM not to blow out the shared cache.
2294          */
2295         bstrategy = GetAccessStrategy(BAS_VACUUM);
2296
2297         /*
2298          * create a memory context to act as fake PortalContext, so that the
2299          * contexts created in the vacuum code are cleaned up for each table.
2300          */
2301         PortalContext = AllocSetContextCreate(AutovacMemCxt,
2302                                                                                   "Autovacuum Portal",
2303                                                                                   ALLOCSET_DEFAULT_SIZES);
2304
2305         /*
2306          * Perform operations on collected tables.
2307          */
2308         foreach(cell, table_oids)
2309         {
2310                 Oid                     relid = lfirst_oid(cell);
2311                 HeapTuple       classTup;
2312                 autovac_table *tab;
2313                 bool            isshared;
2314                 bool            skipit;
2315                 double          stdVacuumCostDelay;
2316                 int                     stdVacuumCostLimit;
2317                 dlist_iter      iter;
2318
2319                 CHECK_FOR_INTERRUPTS();
2320
2321                 /*
2322                  * Check for config changes before processing each collected table.
2323                  */
2324                 if (ConfigReloadPending)
2325                 {
2326                         ConfigReloadPending = false;
2327                         ProcessConfigFile(PGC_SIGHUP);
2328
2329                         /*
2330                          * You might be tempted to bail out if we see autovacuum is now
2331                          * disabled.  Must resist that temptation -- this might be a
2332                          * for-wraparound emergency worker, in which case that would be
2333                          * entirely inappropriate.
2334                          */
2335                 }
2336
2337                 /*
2338                  * Find out whether the table is shared or not.  (It's slightly
2339                  * annoying to fetch the syscache entry just for this, but in typical
2340                  * cases it adds little cost because table_recheck_autovac would
2341                  * refetch the entry anyway.  We could buy that back by copying the
2342                  * tuple here and passing it to table_recheck_autovac, but that
2343                  * increases the odds of that function working with stale data.)
2344                  */
2345                 classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
2346                 if (!HeapTupleIsValid(classTup))
2347                         continue;                       /* somebody deleted the rel, forget it */
2348                 isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared;
2349                 ReleaseSysCache(classTup);
2350
2351                 /*
2352                  * Hold schedule lock from here until we've claimed the table.  We
2353                  * also need the AutovacuumLock to walk the worker array, but that one
2354                  * can just be a shared lock.
2355                  */
2356                 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2357                 LWLockAcquire(AutovacuumLock, LW_SHARED);
2358
2359                 /*
2360                  * Check whether the table is being vacuumed concurrently by another
2361                  * worker.
2362                  */
2363                 skipit = false;
2364                 dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2365                 {
2366                         WorkerInfo      worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2367
2368                         /* ignore myself */
2369                         if (worker == MyWorkerInfo)
2370                                 continue;
2371
2372                         /* ignore workers in other databases (unless table is shared) */
2373                         if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
2374                                 continue;
2375
2376                         if (worker->wi_tableoid == relid)
2377                         {
2378                                 skipit = true;
2379                                 found_concurrent_worker = true;
2380                                 break;
2381                         }
2382                 }
2383                 LWLockRelease(AutovacuumLock);
2384                 if (skipit)
2385                 {
2386                         LWLockRelease(AutovacuumScheduleLock);
2387                         continue;
2388                 }
2389
2390                 /*
2391                  * Store the table's OID in shared memory before releasing the
2392                  * schedule lock, so that other workers don't try to vacuum it
2393                  * concurrently.  (We claim it here so as not to hold
2394                  * AutovacuumScheduleLock while rechecking the stats.)
2395                  */
2396                 MyWorkerInfo->wi_tableoid = relid;
2397                 MyWorkerInfo->wi_sharedrel = isshared;
2398                 LWLockRelease(AutovacuumScheduleLock);
2399
2400                 /*
2401                  * Check whether pgstat data still says we need to vacuum this table.
2402                  * It could have changed if something else processed the table while
2403                  * we weren't looking. This doesn't entirely close the race condition,
2404                  * but it is very small.
2405                  */
2406                 MemoryContextSwitchTo(AutovacMemCxt);
2407                 tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
2408                                                                         effective_multixact_freeze_max_age);
2409                 if (tab == NULL)
2410                 {
2411                         /* someone else vacuumed the table, or it went away */
2412                         LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2413                         MyWorkerInfo->wi_tableoid = InvalidOid;
2414                         MyWorkerInfo->wi_sharedrel = false;
2415                         LWLockRelease(AutovacuumScheduleLock);
2416                         continue;
2417                 }
2418
2419                 /*
2420                  * Remember the prevailing values of the vacuum cost GUCs.  We have to
2421                  * restore these at the bottom of the loop, else we'll compute wrong
2422                  * values in the next iteration of autovac_balance_cost().
2423                  */
2424                 stdVacuumCostDelay = VacuumCostDelay;
2425                 stdVacuumCostLimit = VacuumCostLimit;
2426
2427                 /* Must hold AutovacuumLock while mucking with cost balance info */
2428                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2429
2430                 /* advertise my cost delay parameters for the balancing algorithm */
2431                 MyWorkerInfo->wi_dobalance = tab->at_dobalance;
2432                 MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2433                 MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2434                 MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2435
2436                 /* do a balance */
2437                 autovac_balance_cost();
2438
2439                 /* set the active cost parameters from the result of that */
2440                 AutoVacuumUpdateDelay();
2441
2442                 /* done */
2443                 LWLockRelease(AutovacuumLock);
2444
2445                 /* clean up memory before each iteration */
2446                 MemoryContextResetAndDeleteChildren(PortalContext);
2447
2448                 /*
2449                  * Save the relation name for a possible error message, to avoid a
2450                  * catalog lookup in case of an error.  If any of these return NULL,
2451                  * then the relation has been dropped since last we checked; skip it.
2452                  * Note: they must live in a long-lived memory context because we call
2453                  * vacuum and analyze in different transactions.
2454                  */
2455
2456                 tab->at_relname = get_rel_name(tab->at_relid);
2457                 tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2458                 tab->at_datname = get_database_name(MyDatabaseId);
2459                 if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2460                         goto deleted;
2461
2462                 /*
2463                  * We will abort vacuuming the current table if something errors out,
2464                  * and continue with the next one in schedule; in particular, this
2465                  * happens if we are interrupted with SIGINT.
2466                  */
2467                 PG_TRY();
2468                 {
2469                         /* Use PortalContext for any per-table allocations */
2470                         MemoryContextSwitchTo(PortalContext);
2471
2472                         /* have at it */
2473                         autovacuum_do_vac_analyze(tab, bstrategy);
2474
2475                         /*
2476                          * Clear a possible query-cancel signal, to avoid a late reaction
2477                          * to an automatically-sent signal because of vacuuming the
2478                          * current table (we're done with it, so it would make no sense to
2479                          * cancel at this point.)
2480                          */
2481                         QueryCancelPending = false;
2482                 }
2483                 PG_CATCH();
2484                 {
2485                         /*
2486                          * Abort the transaction, start a new one, and proceed with the
2487                          * next table in our list.
2488                          */
2489                         HOLD_INTERRUPTS();
2490                         if (tab->at_params.options & VACOPT_VACUUM)
2491                                 errcontext("automatic vacuum of table \"%s.%s.%s\"",
2492                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2493                         else
2494                                 errcontext("automatic analyze of table \"%s.%s.%s\"",
2495                                                    tab->at_datname, tab->at_nspname, tab->at_relname);
2496                         EmitErrorReport();
2497
2498                         /* this resets ProcGlobal->statusFlags[i] too */
2499                         AbortOutOfAnyTransaction();
2500                         FlushErrorState();
2501                         MemoryContextResetAndDeleteChildren(PortalContext);
2502
2503                         /* restart our transaction for the following operations */
2504                         StartTransactionCommand();
2505                         RESUME_INTERRUPTS();
2506                 }
2507                 PG_END_TRY();
2508
2509                 /* Make sure we're back in AutovacMemCxt */
2510                 MemoryContextSwitchTo(AutovacMemCxt);
2511
2512                 did_vacuum = true;
2513
2514                 /* ProcGlobal->statusFlags[i] are reset at the next end of xact */
2515
2516                 /* be tidy */
2517 deleted:
2518                 if (tab->at_datname != NULL)
2519                         pfree(tab->at_datname);
2520                 if (tab->at_nspname != NULL)
2521                         pfree(tab->at_nspname);
2522                 if (tab->at_relname != NULL)
2523                         pfree(tab->at_relname);
2524                 pfree(tab);
2525
2526                 /*
2527                  * Remove my info from shared memory.  We could, but intentionally
2528                  * don't, clear wi_cost_limit and friends --- this is on the
2529                  * assumption that we probably have more to do with similar cost
2530                  * settings, so we don't want to give up our share of I/O for a very
2531                  * short interval and thereby thrash the global balance.
2532                  */
2533                 LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2534                 MyWorkerInfo->wi_tableoid = InvalidOid;
2535                 MyWorkerInfo->wi_sharedrel = false;
2536                 LWLockRelease(AutovacuumScheduleLock);
2537
2538                 /* restore vacuum cost GUCs for the next iteration */
2539                 VacuumCostDelay = stdVacuumCostDelay;
2540                 VacuumCostLimit = stdVacuumCostLimit;
2541         }
2542
2543         /*
2544          * Perform additional work items, as requested by backends.
2545          */
2546         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2547         for (i = 0; i < NUM_WORKITEMS; i++)
2548         {
2549                 AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
2550
2551                 if (!workitem->avw_used)
2552                         continue;
2553                 if (workitem->avw_active)
2554                         continue;
2555                 if (workitem->avw_database != MyDatabaseId)
2556                         continue;
2557
2558                 /* claim this one, and release lock while performing it */
2559                 workitem->avw_active = true;
2560                 LWLockRelease(AutovacuumLock);
2561
2562                 perform_work_item(workitem);
2563
2564                 /*
2565                  * Check for config changes before acquiring lock for further jobs.
2566                  */
2567                 CHECK_FOR_INTERRUPTS();
2568                 if (ConfigReloadPending)
2569                 {
2570                         ConfigReloadPending = false;
2571                         ProcessConfigFile(PGC_SIGHUP);
2572                 }
2573
2574                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2575
2576                 /* and mark it done */
2577                 workitem->avw_active = false;
2578                 workitem->avw_used = false;
2579         }
2580         LWLockRelease(AutovacuumLock);
2581
2582         /*
2583          * We leak table_toast_map here (among other things), but since we're
2584          * going away soon, it's not a problem.
2585          */
2586
2587         /*
2588          * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We
2589          * only need to do this once, not after each table.
2590          *
2591          * Even if we didn't vacuum anything, it may still be important to do
2592          * this, because one indirect effect of vac_update_datfrozenxid() is to
2593          * update ShmemVariableCache->xidVacLimit.  That might need to be done
2594          * even if we haven't vacuumed anything, because relations with older
2595          * relfrozenxid values or other databases with older datfrozenxid values
2596          * might have been dropped, allowing xidVacLimit to advance.
2597          *
2598          * However, it's also important not to do this blindly in all cases,
2599          * because when autovacuum=off this will restart the autovacuum launcher.
2600          * If we're not careful, an infinite loop can result, where workers find
2601          * no work to do and restart the launcher, which starts another worker in
2602          * the same database that finds no work to do.  To prevent that, we skip
2603          * this if (1) we found no work to do and (2) we skipped at least one
2604          * table due to concurrent autovacuum activity.  In that case, the other
2605          * worker has already done it, or will do so when it finishes.
2606          */
2607         if (did_vacuum || !found_concurrent_worker)
2608                 vac_update_datfrozenxid();
2609
2610         /* Finally close out the last transaction. */
2611         CommitTransactionCommand();
2612 }
2613
2614 /*
2615  * Execute a previously registered work item.
2616  */
2617 static void
2618 perform_work_item(AutoVacuumWorkItem *workitem)
2619 {
2620         char       *cur_datname = NULL;
2621         char       *cur_nspname = NULL;
2622         char       *cur_relname = NULL;
2623
2624         /*
2625          * Note we do not store table info in MyWorkerInfo, since this is not
2626          * vacuuming proper.
2627          */
2628
2629         /*
2630          * Save the relation name for a possible error message, to avoid a catalog
2631          * lookup in case of an error.  If any of these return NULL, then the
2632          * relation has been dropped since last we checked; skip it.
2633          */
2634         Assert(CurrentMemoryContext == AutovacMemCxt);
2635
2636         cur_relname = get_rel_name(workitem->avw_relation);
2637         cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation));
2638         cur_datname = get_database_name(MyDatabaseId);
2639         if (!cur_relname || !cur_nspname || !cur_datname)
2640                 goto deleted2;
2641
2642         autovac_report_workitem(workitem, cur_nspname, cur_relname);
2643
2644         /* clean up memory before each work item */
2645         MemoryContextResetAndDeleteChildren(PortalContext);
2646
2647         /*
2648          * We will abort the current work item if something errors out, and
2649          * continue with the next one; in particular, this happens if we are
2650          * interrupted with SIGINT.  Note that this means that the work item list
2651          * can be lossy.
2652          */
2653         PG_TRY();
2654         {
2655                 /* Use PortalContext for any per-work-item allocations */
2656                 MemoryContextSwitchTo(PortalContext);
2657
2658                 /* have at it */
2659                 switch (workitem->avw_type)
2660                 {
2661                         case AVW_BRINSummarizeRange:
2662                                 DirectFunctionCall2(brin_summarize_range,
2663                                                                         ObjectIdGetDatum(workitem->avw_relation),
2664                                                                         Int64GetDatum((int64) workitem->avw_blockNumber));
2665                                 break;
2666                         default:
2667                                 elog(WARNING, "unrecognized work item found: type %d",
2668                                          workitem->avw_type);
2669                                 break;
2670                 }
2671
2672                 /*
2673                  * Clear a possible query-cancel signal, to avoid a late reaction to
2674                  * an automatically-sent signal because of vacuuming the current table
2675                  * (we're done with it, so it would make no sense to cancel at this
2676                  * point.)
2677                  */
2678                 QueryCancelPending = false;
2679         }
2680         PG_CATCH();
2681         {
2682                 /*
2683                  * Abort the transaction, start a new one, and proceed with the next
2684                  * table in our list.
2685                  */
2686                 HOLD_INTERRUPTS();
2687                 errcontext("processing work entry for relation \"%s.%s.%s\"",
2688                                    cur_datname, cur_nspname, cur_relname);
2689                 EmitErrorReport();
2690
2691                 /* this resets ProcGlobal->statusFlags[i] too */
2692                 AbortOutOfAnyTransaction();
2693                 FlushErrorState();
2694                 MemoryContextResetAndDeleteChildren(PortalContext);
2695
2696                 /* restart our transaction for the following operations */
2697                 StartTransactionCommand();
2698                 RESUME_INTERRUPTS();
2699         }
2700         PG_END_TRY();
2701
2702         /* Make sure we're back in AutovacMemCxt */
2703         MemoryContextSwitchTo(AutovacMemCxt);
2704
2705         /* We intentionally do not set did_vacuum here */
2706
2707         /* be tidy */
2708 deleted2:
2709         if (cur_datname)
2710                 pfree(cur_datname);
2711         if (cur_nspname)
2712                 pfree(cur_nspname);
2713         if (cur_relname)
2714                 pfree(cur_relname);
2715 }
2716
2717 /*
2718  * extract_autovac_opts
2719  *
2720  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2721  * reloptions, if set; otherwise, return NULL.
2722  *
2723  * Note: callers do not have a relation lock on the table at this point,
2724  * so the table could have been dropped, and its catalog rows gone, after
2725  * we acquired the pg_class row.  If pg_class had a TOAST table, this would
2726  * be a risk; fortunately, it doesn't.
2727  */
2728 static AutoVacOpts *
2729 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2730 {
2731         bytea      *relopts;
2732         AutoVacOpts *av;
2733
2734         Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2735                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2736                    ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2737
2738         relopts = extractRelOptions(tup, pg_class_desc, NULL);
2739         if (relopts == NULL)
2740                 return NULL;
2741
2742         av = palloc(sizeof(AutoVacOpts));
2743         memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2744         pfree(relopts);
2745
2746         return av;
2747 }
2748
2749
2750 /*
2751  * table_recheck_autovac
2752  *
2753  * Recheck whether a table still needs vacuum or analyze.  Return value is a
2754  * valid autovac_table pointer if it does, NULL otherwise.
2755  *
2756  * Note that the returned autovac_table does not have the name fields set.
2757  */
2758 static autovac_table *
2759 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2760                                           TupleDesc pg_class_desc,
2761                                           int effective_multixact_freeze_max_age)
2762 {
2763         Form_pg_class classForm;
2764         HeapTuple       classTup;
2765         bool            dovacuum;
2766         bool            doanalyze;
2767         autovac_table *tab = NULL;
2768         bool            wraparound;
2769         AutoVacOpts *avopts;
2770
2771         /* fetch the relation's relcache entry */
2772         classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2773         if (!HeapTupleIsValid(classTup))
2774                 return NULL;
2775         classForm = (Form_pg_class) GETSTRUCT(classTup);
2776
2777         /*
2778          * Get the applicable reloptions.  If it is a TOAST table, try to get the
2779          * main table reloptions if the toast table itself doesn't have.
2780          */
2781         avopts = extract_autovac_opts(classTup, pg_class_desc);
2782         if (classForm->relkind == RELKIND_TOASTVALUE &&
2783                 avopts == NULL && table_toast_map != NULL)
2784         {
2785                 av_relation *hentry;
2786                 bool            found;
2787
2788                 hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2789                 if (found && hentry->ar_hasrelopts)
2790                         avopts = &hentry->ar_reloptions;
2791         }
2792
2793         recheck_relation_needs_vacanalyze(relid, avopts, classForm,
2794                                                                           effective_multixact_freeze_max_age,
2795                                                                           &dovacuum, &doanalyze, &wraparound);
2796
2797         /* OK, it needs something done */
2798         if (doanalyze || dovacuum)
2799         {
2800                 int                     freeze_min_age;
2801                 int                     freeze_table_age;
2802                 int                     multixact_freeze_min_age;
2803                 int                     multixact_freeze_table_age;
2804                 int                     vac_cost_limit;
2805                 double          vac_cost_delay;
2806                 int                     log_min_duration;
2807
2808                 /*
2809                  * Calculate the vacuum cost parameters and the freeze ages.  If there
2810                  * are options set in pg_class.reloptions, use them; in the case of a
2811                  * toast table, try the main table too.  Otherwise use the GUC
2812                  * defaults, autovacuum's own first and plain vacuum second.
2813                  */
2814
2815                 /* -1 in autovac setting means use plain vacuum_cost_delay */
2816                 vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2817                         ? avopts->vacuum_cost_delay
2818                         : (autovacuum_vac_cost_delay >= 0)
2819                         ? autovacuum_vac_cost_delay
2820                         : VacuumCostDelay;
2821
2822                 /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2823                 vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2824                         ? avopts->vacuum_cost_limit
2825                         : (autovacuum_vac_cost_limit > 0)
2826                         ? autovacuum_vac_cost_limit
2827                         : VacuumCostLimit;
2828
2829                 /* -1 in autovac setting means use log_autovacuum_min_duration */
2830                 log_min_duration = (avopts && avopts->log_min_duration >= 0)
2831                         ? avopts->log_min_duration
2832                         : Log_autovacuum_min_duration;
2833
2834                 /* these do not have autovacuum-specific settings */
2835                 freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2836                         ? avopts->freeze_min_age
2837                         : default_freeze_min_age;
2838
2839                 freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2840                         ? avopts->freeze_table_age
2841                         : default_freeze_table_age;
2842
2843                 multixact_freeze_min_age = (avopts &&
2844                                                                         avopts->multixact_freeze_min_age >= 0)
2845                         ? avopts->multixact_freeze_min_age
2846                         : default_multixact_freeze_min_age;
2847
2848                 multixact_freeze_table_age = (avopts &&
2849                                                                           avopts->multixact_freeze_table_age >= 0)
2850                         ? avopts->multixact_freeze_table_age
2851                         : default_multixact_freeze_table_age;
2852
2853                 tab = palloc(sizeof(autovac_table));
2854                 tab->at_relid = relid;
2855                 tab->at_sharedrel = classForm->relisshared;
2856
2857                 /*
2858                  * Select VACUUM options.  Note we don't say VACOPT_PROCESS_TOAST, so
2859                  * that vacuum() skips toast relations.  Also note we tell vacuum() to
2860                  * skip vac_update_datfrozenxid(); we'll do that separately.
2861                  */
2862                 tab->at_params.options =
2863                         (dovacuum ? (VACOPT_VACUUM |
2864                                                  VACOPT_PROCESS_MAIN |
2865                                                  VACOPT_SKIP_DATABASE_STATS) : 0) |
2866                         (doanalyze ? VACOPT_ANALYZE : 0) |
2867                         (!wraparound ? VACOPT_SKIP_LOCKED : 0);
2868
2869                 /*
2870                  * index_cleanup and truncate are unspecified at first in autovacuum.
2871                  * They will be filled in with usable values using their reloptions
2872                  * (or reloption defaults) later.
2873                  */
2874                 tab->at_params.index_cleanup = VACOPTVALUE_UNSPECIFIED;
2875                 tab->at_params.truncate = VACOPTVALUE_UNSPECIFIED;
2876                 /* As of now, we don't support parallel vacuum for autovacuum */
2877                 tab->at_params.nworkers = -1;
2878                 tab->at_params.freeze_min_age = freeze_min_age;
2879                 tab->at_params.freeze_table_age = freeze_table_age;
2880                 tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age;
2881                 tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age;
2882                 tab->at_params.is_wraparound = wraparound;
2883                 tab->at_params.log_min_duration = log_min_duration;
2884                 tab->at_vacuum_cost_limit = vac_cost_limit;
2885                 tab->at_vacuum_cost_delay = vac_cost_delay;
2886                 tab->at_relname = NULL;
2887                 tab->at_nspname = NULL;
2888                 tab->at_datname = NULL;
2889
2890                 /*
2891                  * If any of the cost delay parameters has been set individually for
2892                  * this table, disable the balancing algorithm.
2893                  */
2894                 tab->at_dobalance =
2895                         !(avopts && (avopts->vacuum_cost_limit > 0 ||
2896                                                  avopts->vacuum_cost_delay > 0));
2897         }
2898
2899         heap_freetuple(classTup);
2900         return tab;
2901 }
2902
2903 /*
2904  * recheck_relation_needs_vacanalyze
2905  *
2906  * Subroutine for table_recheck_autovac.
2907  *
2908  * Fetch the pgstat of a relation and recheck whether a relation
2909  * needs to be vacuumed or analyzed.
2910  */
2911 static void
2912 recheck_relation_needs_vacanalyze(Oid relid,
2913                                                                   AutoVacOpts *avopts,
2914                                                                   Form_pg_class classForm,
2915                                                                   int effective_multixact_freeze_max_age,
2916                                                                   bool *dovacuum,
2917                                                                   bool *doanalyze,
2918                                                                   bool *wraparound)
2919 {
2920         PgStat_StatTabEntry *tabentry;
2921
2922         /* fetch the pgstat table entry */
2923         tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
2924                                                                                           relid);
2925
2926         relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2927                                                           effective_multixact_freeze_max_age,
2928                                                           dovacuum, doanalyze, wraparound);
2929
2930         /* ignore ANALYZE for toast tables */
2931         if (classForm->relkind == RELKIND_TOASTVALUE)
2932                 *doanalyze = false;
2933 }
2934
2935 /*
2936  * relation_needs_vacanalyze
2937  *
2938  * Check whether a relation needs to be vacuumed or analyzed; return each into
2939  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2940  * being forced because of Xid or multixact wraparound.
2941  *
2942  * relopts is a pointer to the AutoVacOpts options (either for itself in the
2943  * case of a plain table, or for either itself or its parent table in the case
2944  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2945  * NULL.
2946  *
2947  * A table needs to be vacuumed if the number of dead tuples exceeds a
2948  * threshold.  This threshold is calculated as
2949  *
2950  * threshold = vac_base_thresh + vac_scale_factor * reltuples
2951  *
2952  * For analyze, the analysis done is that the number of tuples inserted,
2953  * deleted and updated since the last analyze exceeds a threshold calculated
2954  * in the same fashion as above.  Note that the cumulative stats system stores
2955  * the number of tuples (both live and dead) that there were as of the last
2956  * analyze.  This is asymmetric to the VACUUM case.
2957  *
2958  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2959  * transactions back, and if its relminmxid is more than
2960  * multixact_freeze_max_age multixacts back.
2961  *
2962  * A table whose autovacuum_enabled option is false is
2963  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2964  * Thus autovacuum can be disabled for specific tables. Also, when the cumulative
2965  * stats system does not have data about a table, it will be skipped.
2966  *
2967  * A table whose vac_base_thresh value is < 0 takes the base value from the
2968  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2969  * value < 0 is substituted with the value of
2970  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2971  */
2972 static void
2973 relation_needs_vacanalyze(Oid relid,
2974                                                   AutoVacOpts *relopts,
2975                                                   Form_pg_class classForm,
2976                                                   PgStat_StatTabEntry *tabentry,
2977                                                   int effective_multixact_freeze_max_age,
2978  /* output params below */
2979                                                   bool *dovacuum,
2980                                                   bool *doanalyze,
2981                                                   bool *wraparound)
2982 {
2983         bool            force_vacuum;
2984         bool            av_enabled;
2985         float4          reltuples;              /* pg_class.reltuples */
2986
2987         /* constants from reloptions or GUC variables */
2988         int                     vac_base_thresh,
2989                                 vac_ins_base_thresh,
2990                                 anl_base_thresh;
2991         float4          vac_scale_factor,
2992                                 vac_ins_scale_factor,
2993                                 anl_scale_factor;
2994
2995         /* thresholds calculated from above constants */
2996         float4          vacthresh,
2997                                 vacinsthresh,
2998                                 anlthresh;
2999
3000         /* number of vacuum (resp. analyze) tuples at this time */
3001         float4          vactuples,
3002                                 instuples,
3003                                 anltuples;
3004
3005         /* freeze parameters */
3006         int                     freeze_max_age;
3007         int                     multixact_freeze_max_age;
3008         TransactionId xidForceLimit;
3009         MultiXactId multiForceLimit;
3010
3011         Assert(classForm != NULL);
3012         Assert(OidIsValid(relid));
3013
3014         /*
3015          * Determine vacuum/analyze equation parameters.  We have two possible
3016          * sources: the passed reloptions (which could be a main table or a toast
3017          * table), or the autovacuum GUC variables.
3018          */
3019
3020         /* -1 in autovac setting means use plain vacuum_scale_factor */
3021         vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
3022                 ? relopts->vacuum_scale_factor
3023                 : autovacuum_vac_scale;
3024
3025         vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
3026                 ? relopts->vacuum_threshold
3027                 : autovacuum_vac_thresh;
3028
3029         vac_ins_scale_factor = (relopts && relopts->vacuum_ins_scale_factor >= 0)
3030                 ? relopts->vacuum_ins_scale_factor
3031                 : autovacuum_vac_ins_scale;
3032
3033         /* -1 is used to disable insert vacuums */
3034         vac_ins_base_thresh = (relopts && relopts->vacuum_ins_threshold >= -1)
3035                 ? relopts->vacuum_ins_threshold
3036                 : autovacuum_vac_ins_thresh;
3037
3038         anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
3039                 ? relopts->analyze_scale_factor
3040                 : autovacuum_anl_scale;
3041
3042         anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
3043                 ? relopts->analyze_threshold
3044                 : autovacuum_anl_thresh;
3045
3046         freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
3047                 ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
3048                 : autovacuum_freeze_max_age;
3049
3050         multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0)
3051                 ? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age)
3052                 : effective_multixact_freeze_max_age;
3053
3054         av_enabled = (relopts ? relopts->enabled : true);
3055
3056         /* Force vacuum if table is at risk of wraparound */
3057         xidForceLimit = recentXid - freeze_max_age;
3058         if (xidForceLimit < FirstNormalTransactionId)
3059                 xidForceLimit -= FirstNormalTransactionId;
3060         force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
3061                                         TransactionIdPrecedes(classForm->relfrozenxid,
3062                                                                                   xidForceLimit));
3063         if (!force_vacuum)
3064         {
3065                 multiForceLimit = recentMulti - multixact_freeze_max_age;
3066                 if (multiForceLimit < FirstMultiXactId)
3067                         multiForceLimit -= FirstMultiXactId;
3068                 force_vacuum = MultiXactIdIsValid(classForm->relminmxid) &&
3069                         MultiXactIdPrecedes(classForm->relminmxid, multiForceLimit);
3070         }
3071         *wraparound = force_vacuum;
3072
3073         /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
3074         if (!av_enabled && !force_vacuum)
3075         {
3076                 *doanalyze = false;
3077                 *dovacuum = false;
3078                 return;
3079         }
3080
3081         /*
3082          * If we found stats for the table, and autovacuum is currently enabled,
3083          * make a threshold-based decision whether to vacuum and/or analyze.  If
3084          * autovacuum is currently disabled, we must be here for anti-wraparound
3085          * vacuuming only, so don't vacuum (or analyze) anything that's not being
3086          * forced.
3087          */
3088         if (PointerIsValid(tabentry) && AutoVacuumingActive())
3089         {
3090                 reltuples = classForm->reltuples;
3091                 vactuples = tabentry->dead_tuples;
3092                 instuples = tabentry->ins_since_vacuum;
3093                 anltuples = tabentry->mod_since_analyze;
3094
3095                 /* If the table hasn't yet been vacuumed, take reltuples as zero */
3096                 if (reltuples < 0)
3097                         reltuples = 0;
3098
3099                 vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
3100                 vacinsthresh = (float4) vac_ins_base_thresh + vac_ins_scale_factor * reltuples;
3101                 anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
3102
3103                 /*
3104                  * Note that we don't need to take special consideration for stat
3105                  * reset, because if that happens, the last vacuum and analyze counts
3106                  * will be reset too.
3107                  */
3108                 if (vac_ins_base_thresh >= 0)
3109                         elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
3110                                  NameStr(classForm->relname),
3111                                  vactuples, vacthresh, instuples, vacinsthresh, anltuples, anlthresh);
3112                 else
3113                         elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: (disabled), anl: %.0f (threshold %.0f)",
3114                                  NameStr(classForm->relname),
3115                                  vactuples, vacthresh, anltuples, anlthresh);
3116
3117                 /* Determine if this table needs vacuum or analyze. */
3118                 *dovacuum = force_vacuum || (vactuples > vacthresh) ||
3119                         (vac_ins_base_thresh >= 0 && instuples > vacinsthresh);
3120                 *doanalyze = (anltuples > anlthresh);
3121         }
3122         else
3123         {
3124                 /*
3125                  * Skip a table not found in stat hash, unless we have to force vacuum
3126                  * for anti-wrap purposes.  If it's not acted upon, there's no need to
3127                  * vacuum it.
3128                  */
3129                 *dovacuum = force_vacuum;
3130                 *doanalyze = false;
3131         }
3132
3133         /* ANALYZE refuses to work with pg_statistic */
3134         if (relid == StatisticRelationId)
3135                 *doanalyze = false;
3136 }
3137
3138 /*
3139  * autovacuum_do_vac_analyze
3140  *              Vacuum and/or analyze the specified table
3141  */
3142 static void
3143 autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy)
3144 {
3145         RangeVar   *rangevar;
3146         VacuumRelation *rel;
3147         List       *rel_list;
3148
3149         /* Let pgstat know what we're doing */
3150         autovac_report_activity(tab);
3151
3152         /* Set up one VacuumRelation target, identified by OID, for vacuum() */
3153         rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -1);
3154         rel = makeVacuumRelation(rangevar, tab->at_relid, NIL);
3155         rel_list = list_make1(rel);
3156
3157         vacuum(rel_list, &tab->at_params, bstrategy, true);
3158 }
3159
3160 /*
3161  * autovac_report_activity
3162  *              Report to pgstat what autovacuum is doing
3163  *
3164  * We send a SQL string corresponding to what the user would see if the
3165  * equivalent command was to be issued manually.
3166  *
3167  * Note we assume that we are going to report the next command as soon as we're
3168  * done with the current one, and exit right after the last one, so we don't
3169  * bother to report "<IDLE>" or some such.
3170  */
3171 static void
3172 autovac_report_activity(autovac_table *tab)
3173 {
3174 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
3175         char            activity[MAX_AUTOVAC_ACTIV_LEN];
3176         int                     len;
3177
3178         /* Report the command and possible options */
3179         if (tab->at_params.options & VACOPT_VACUUM)
3180                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3181                                  "autovacuum: VACUUM%s",
3182                                  tab->at_params.options & VACOPT_ANALYZE ? " ANALYZE" : "");
3183         else
3184                 snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3185                                  "autovacuum: ANALYZE");
3186
3187         /*
3188          * Report the qualified name of the relation.
3189          */
3190         len = strlen(activity);
3191
3192         snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3193                          " %s.%s%s", tab->at_nspname, tab->at_relname,
3194                          tab->at_params.is_wraparound ? " (to prevent wraparound)" : "");
3195
3196         /* Set statement_timestamp() to current time for pg_stat_activity */
3197         SetCurrentStatementStartTimestamp();
3198
3199         pgstat_report_activity(STATE_RUNNING, activity);
3200 }
3201
3202 /*
3203  * autovac_report_workitem
3204  *              Report to pgstat that autovacuum is processing a work item
3205  */
3206 static void
3207 autovac_report_workitem(AutoVacuumWorkItem *workitem,
3208                                                 const char *nspname, const char *relname)
3209 {
3210         char            activity[MAX_AUTOVAC_ACTIV_LEN + 12 + 2];
3211         char            blk[12 + 2];
3212         int                     len;
3213
3214         switch (workitem->avw_type)
3215         {
3216                 case AVW_BRINSummarizeRange:
3217                         snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3218                                          "autovacuum: BRIN summarize");
3219                         break;
3220         }
3221
3222         /*
3223          * Report the qualified name of the relation, and the block number if any
3224          */
3225         len = strlen(activity);
3226
3227         if (BlockNumberIsValid(workitem->avw_blockNumber))
3228                 snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber);
3229         else
3230                 blk[0] = '\0';
3231
3232         snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3233                          " %s.%s%s", nspname, relname, blk);
3234
3235         /* Set statement_timestamp() to current time for pg_stat_activity */
3236         SetCurrentStatementStartTimestamp();
3237
3238         pgstat_report_activity(STATE_RUNNING, activity);
3239 }
3240
3241 /*
3242  * AutoVacuumingActive
3243  *              Check GUC vars and report whether the autovacuum process should be
3244  *              running.
3245  */
3246 bool
3247 AutoVacuumingActive(void)
3248 {
3249         if (!autovacuum_start_daemon || !pgstat_track_counts)
3250                 return false;
3251         return true;
3252 }
3253
3254 /*
3255  * Request one work item to the next autovacuum run processing our database.
3256  * Return false if the request can't be recorded.
3257  */
3258 bool
3259 AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId,
3260                                           BlockNumber blkno)
3261 {
3262         int                     i;
3263         bool            result = false;
3264
3265         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
3266
3267         /*
3268          * Locate an unused work item and fill it with the given data.
3269          */
3270         for (i = 0; i < NUM_WORKITEMS; i++)
3271         {
3272                 AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
3273
3274                 if (workitem->avw_used)
3275                         continue;
3276
3277                 workitem->avw_used = true;
3278                 workitem->avw_active = false;
3279                 workitem->avw_type = type;
3280                 workitem->avw_database = MyDatabaseId;
3281                 workitem->avw_relation = relationId;
3282                 workitem->avw_blockNumber = blkno;
3283                 result = true;
3284
3285                 /* done */
3286                 break;
3287         }
3288
3289         LWLockRelease(AutovacuumLock);
3290
3291         return result;
3292 }
3293
3294 /*
3295  * autovac_init
3296  *              This is called at postmaster initialization.
3297  *
3298  * All we do here is annoy the user if he got it wrong.
3299  */
3300 void
3301 autovac_init(void)
3302 {
3303         if (autovacuum_start_daemon && !pgstat_track_counts)
3304                 ereport(WARNING,
3305                                 (errmsg("autovacuum not started because of misconfiguration"),
3306                                  errhint("Enable the \"track_counts\" option.")));
3307 }
3308
3309 /*
3310  * IsAutoVacuum functions
3311  *              Return whether this is either a launcher autovacuum process or a worker
3312  *              process.
3313  */
3314 bool
3315 IsAutoVacuumLauncherProcess(void)
3316 {
3317         return am_autovacuum_launcher;
3318 }
3319
3320 bool
3321 IsAutoVacuumWorkerProcess(void)
3322 {
3323         return am_autovacuum_worker;
3324 }
3325
3326
3327 /*
3328  * AutoVacuumShmemSize
3329  *              Compute space needed for autovacuum-related shared memory
3330  */
3331 Size
3332 AutoVacuumShmemSize(void)
3333 {
3334         Size            size;
3335
3336         /*
3337          * Need the fixed struct and the array of WorkerInfoData.
3338          */
3339         size = sizeof(AutoVacuumShmemStruct);
3340         size = MAXALIGN(size);
3341         size = add_size(size, mul_size(autovacuum_max_workers,
3342                                                                    sizeof(WorkerInfoData)));
3343         return size;
3344 }
3345
3346 /*
3347  * AutoVacuumShmemInit
3348  *              Allocate and initialize autovacuum-related shared memory
3349  */
3350 void
3351 AutoVacuumShmemInit(void)
3352 {
3353         bool            found;
3354
3355         AutoVacuumShmem = (AutoVacuumShmemStruct *)
3356                 ShmemInitStruct("AutoVacuum Data",
3357                                                 AutoVacuumShmemSize(),
3358                                                 &found);
3359
3360         if (!IsUnderPostmaster)
3361         {
3362                 WorkerInfo      worker;
3363                 int                     i;
3364
3365                 Assert(!found);
3366
3367                 AutoVacuumShmem->av_launcherpid = 0;
3368                 dlist_init(&AutoVacuumShmem->av_freeWorkers);
3369                 dlist_init(&AutoVacuumShmem->av_runningWorkers);
3370                 AutoVacuumShmem->av_startingWorker = NULL;
3371                 memset(AutoVacuumShmem->av_workItems, 0,
3372                            sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS);
3373
3374                 worker = (WorkerInfo) ((char *) AutoVacuumShmem +
3375                                                            MAXALIGN(sizeof(AutoVacuumShmemStruct)));
3376
3377                 /* initialize the WorkerInfo free list */
3378                 for (i = 0; i < autovacuum_max_workers; i++)
3379                         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
3380                                                         &worker[i].wi_links);
3381         }
3382         else
3383                 Assert(found);
3384 }
3385
3386 /*
3387  * GUC check_hook for autovacuum_work_mem
3388  */
3389 bool
3390 check_autovacuum_work_mem(int *newval, void **extra, GucSource source)
3391 {
3392         /*
3393          * -1 indicates fallback.
3394          *
3395          * If we haven't yet changed the boot_val default of -1, just let it be.
3396          * Autovacuum will look to maintenance_work_mem instead.
3397          */
3398         if (*newval == -1)
3399                 return true;
3400
3401         /*
3402          * We clamp manually-set values to at least 1MB.  Since
3403          * maintenance_work_mem is always set to at least this value, do the same
3404          * here.
3405          */
3406         if (*newval < 1024)
3407                 *newval = 1024;
3408
3409         return true;
3410 }