src/mailman/bin/master.py
# Copyright (C) 2001-2019 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman. If not, see <http://www.gnu.org/licenses/>.

"""Master subprocess watcher."""

import os
import sys
import click
import signal
import socket
import logging

from datetime import timedelta
from enum import Enum
from flufl.lock import Lock, NotLockedError, TimeOutError
from lazr.config import as_boolean
from mailman.config import config
from mailman.core.i18n import _
from mailman.core.initialize import initialize
from mailman.core.logging import reopen
from mailman.utilities.options import I18nCommand, validate_runner_spec
from mailman.version import MAILMAN_VERSION_FULL
from public import public


DOT = '.'
LOCK_LIFETIME = timedelta(days=1, hours=6)
SECONDS_IN_A_DAY = 86400
SUBPROC_START_WAIT = timedelta(seconds=20)
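
# Cross-reference note: SECONDS_IN_A_DAY drives the daily SIGALRM lock refresh
# installed in Loop.install_signal_handlers() below; the extra six hours on
# LOCK_LIFETIME leaves a cushion so the lock should not expire between
# refreshes.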

# Environment variables to forward into subprocesses.
PRESERVE_ENVS = (
    'COVERAGE_PROCESS_START',
    'LANG',
    'LANGUAGE',
    'LC_ADDRESS',
    'LC_ALL',
    'LC_COLLATE',
    'LC_CTYPE',
    'LC_IDENTIFICATION',
    'LC_MEASUREMENT',
    'LC_MESSAGES',
    'LC_MONETARY',
    'LC_NAME',
    'LC_NUMERIC',
    'LC_PAPER',
    'LC_TELEPHONE',
    'LC_TIME',
    'LOCALE_ARCHIVE',
    'MAILMAN_EXTRA_TESTING_CFG',
    'PYTHONPATH',
    'PYTHONHOME',
    )


@public
class WatcherState(Enum):
    """Enum for the state of the master process watcher."""
    # No lock has been acquired by any process.
    none = 0
    # Another master watcher is running.
    conflict = 1
    # No conflicting process exists.
    stale_lock = 2
    # Hostname from lock file doesn't match.
    host_mismatch = 3


@public
def master_state(lock_file=None):
    """Get the state of the master watcher.

    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :type lock_file: str
    :return: 2-tuple of the WatcherState describing the state of the lock
        file, and the lock object.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    # We'll never acquire the lock, so the lifetime doesn't matter.
    lock = Lock(lock_file)
    try:
        hostname, pid, tempfile = lock.details
    except NotLockedError:
        return WatcherState.none, lock
    if hostname != socket.getfqdn():
        return WatcherState.host_mismatch, lock
    # Find out if the process exists by calling kill with signal 0.
    try:
        os.kill(pid, 0)
        return WatcherState.conflict, lock
    except ProcessLookupError:
        # No matching process id.
        return WatcherState.stale_lock, lock
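
# An illustrative (hypothetical) use of the pair returned above:
#
#     state, lock = master_state()
#     if state is WatcherState.conflict:
#         print('another master is already running on this host')
#
# acquire_lock() below relies on exactly this state to choose its error
# message when the lock cannot be taken.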


def acquire_lock_1(force, lock_file=None):
    """Try to acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :type force: bool
    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :type lock_file: str
    :return: The master lock.
    :raises: `TimeOutError` if the lock could not be acquired.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    lock = Lock(lock_file, LOCK_LIFETIME)
    try:
        lock.lock(timedelta(seconds=0.1))
        return lock
    except TimeOutError:
        if not force:
            raise
        # Force removal of lock first.
        lock.disown()
        hostname, pid, tempfile = lock.details
        os.unlink(lock_file)
        return acquire_lock_1(force=False)
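
# Flow sketch (informational, not additional behavior): with force=True a
# timed-out first attempt disowns the existing lock, unlinks the lock file by
# hand, and retries exactly once with force=False, so a second timeout simply
# propagates to the caller.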


def acquire_lock(force):
    """Acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :type force: bool
    :return: The master runner lock. If the lock could not be acquired, an
        error message is printed to standard error and the process exits.
    """
    try:
        lock = acquire_lock_1(force)
        return lock
    except TimeOutError:
        status, lock = master_state()
        if status is WatcherState.conflict:
            # Hostname matches and process exists.
            message = _("""\
The master lock could not be acquired because it appears as though another
master is already running.""")
        elif status is WatcherState.stale_lock:
            # Hostname matches but the process does not exist.
            program = sys.argv[0]                           # noqa: F841
            message = _("""\
The master lock could not be acquired. It appears as though there is a stale
master lock. Try re-running $program with the --force flag.""")
        elif status is WatcherState.host_mismatch:
            # Hostname doesn't even match.
            hostname, pid, tempfile = lock.details
            message = _("""\
The master lock could not be acquired, because it appears as if some process
on some other host may have acquired it. We can't test for stale locks across
host boundaries, so you'll have to clean this up manually.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        else:
            assert status is WatcherState.none, (
                'Invalid enum value: {}'.format(status))
            hostname, pid, tempfile = lock.details
            message = _("""\
For unknown reasons, the master lock could not be acquired.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        print(message, file=sys.stderr)
        sys.exit(1)
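
# A note on the messages above: the $program, $hostname and $config.LOCK_FILE
# placeholders are filled in by the _() translation machinery, which performs
# flufl.i18n-style substitution from the caller's namespace. That is also why
# `program` is assigned but carries a noqa: F841 marker; flake8 cannot see the
# indirect use.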


class PIDWatcher:
    """A class which safely manages child process ids."""

    def __init__(self):
        self._pids = {}

    def __contains__(self, pid):
        return pid in self._pids.keys()

    def __iter__(self):
        # Safely iterate over all the keys in the dictionary. Because
        # asynchronous signals are involved, the dictionary's size could
        # change during iteration. Iterate over a copy of the keys to avoid
        # that.
        for pid in list(self._pids):
            yield pid

    def add(self, pid, info):
        """Add process information.

        :param pid: The process id. The watcher must not already be tracking
            this process id.
        :type pid: int
        :param info: The process information.
        :type info: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        """
        old_info = self._pids.get(pid)
        assert old_info is None, (
            'Duplicate process id {0} with existing info: {1}'.format(
                pid, old_info))
        self._pids[pid] = info

    def pop(self, pid):
        """Remove and return existing process information.

        :param pid: The process id. The watcher must already be tracking this
            process id.
        :type pid: int
        :return: The process information.
        :rtype: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        :raise KeyError: if the process id is not being tracked.
        """
        return self._pids.pop(pid)

    def drop(self, pid):
        """Remove and return existing process information.

        This is like `pop()` except that no `KeyError` is raised if the
        process id is not being tracked.

        :param pid: The process id.
        :type pid: int
        :return: The process information, or None if the process id is not
            being tracked.
        :rtype: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        """
        return self._pids.pop(pid, None)
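
    # Illustrative bookkeeping (hypothetical values): a populated watcher
    # might hold {1234: ('in', 0, 1, 0), 4321: ('out', 0, 1, 2)}, i.e. one
    # (runner-name, slice-number, slice-count, restart-count) entry per
    # tracked child pid.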


@public
class Loop:
    """Main control loop class."""

    def __init__(self, lock=None, restartable=None, config_file=None):
        self._lock = lock
        self._restartable = restartable
        self._config_file = config_file
        self._kids = PIDWatcher()

    def install_signal_handlers(self):
        """Install various signal handlers for control from the master."""
        log = logging.getLogger('mailman.runner')
        # Set up our signal handlers. Also set up a SIGALRM handler to
        # refresh the lock once per day. The lock lifetime is 1 day + 6 hours
        # so this should be plenty.
        def sigalrm_handler(signum, frame):                 # noqa: E306
            self._lock.refresh()
            signal.alarm(SECONDS_IN_A_DAY)
        signal.signal(signal.SIGALRM, sigalrm_handler)
        signal.alarm(SECONDS_IN_A_DAY)
        # SIGHUP tells the runners to close and reopen their log files.
        def sighup_handler(signum, frame):                  # noqa: E306
            reopen()
            for pid in self._kids:
                os.kill(pid, signal.SIGHUP)
            log.info('Master watcher caught SIGHUP. Re-opening log files.')
        signal.signal(signal.SIGHUP, sighup_handler)
        # SIGUSR1 is used by 'mailman restart'.
        def sigusr1_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGUSR1)
            log.info('Master watcher caught SIGUSR1. Exiting.')
        signal.signal(signal.SIGUSR1, sigusr1_handler)
        # SIGTERM is what init will kill this process with when changing run
        # levels. It's also the signal 'mailman stop' uses.
        def sigterm_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGTERM)
            log.info('Master watcher caught SIGTERM. Exiting.')
        signal.signal(signal.SIGTERM, sigterm_handler)
        # SIGINT is what control-C gives.
        def sigint_handler(signum, frame):                  # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGINT)
            log.info('Master watcher caught SIGINT. Restarting.')
        signal.signal(signal.SIGINT, sigint_handler)
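
    # Operational note (illustrative, not an API): these handlers are what
    # 'mailman stop' (SIGTERM) and 'mailman restart' (SIGUSR1) ultimately
    # exercise; sending the signal by hand, e.g. `kill -USR1 $(cat
    # data/master.pid)`, has the same effect. The pid file path shown is the
    # one quoted in main()'s docstring; the actual location comes from
    # config.PID_FILE.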

    def _start_runner(self, spec):
        """Start a runner.

        All arguments are passed to the process.

        :param spec: A runner spec, in a format acceptable to
            bin/runner's --runner argument, e.g. name:slice:count
        :type spec: string
        :return: The process id of the child runner.
        :rtype: int
        """
        pid = os.fork()
        if pid:
            # Parent.
            return pid
        # Child.
        #
        # Set the environment variable which tells the runner that it's
        # running under bin/master control. This subtly changes the error
        # behavior of bin/runner.
        env = {'MAILMAN_UNDER_MASTER_CONTROL': '1'}
        # Craft the command line arguments for the exec() call.
        rswitch = '--runner=' + spec
        # Always pass the explicit path to the configuration file to the
        # sub-runners. This avoids any debate about which cfg file is used.
        config_file = (config.filename if self._config_file is None
                       else self._config_file)
        # Wherever master lives, so too must live the runner script.
        exe = os.path.join(config.BIN_DIR, 'runner')        # pragma: nocover
        # sys.executable, the absolute path to the Python interpreter, must
        # be given as argv[0] due to Python's library search algorithm.
        args = [sys.executable, sys.executable, exe,        # pragma: nocover
                '-C', config_file, rswitch]
        log = logging.getLogger('mailman.runner')
        log.debug('starting: %s', args)
        # We must pass this environment variable through if it's set,
        # otherwise runner processes will not have the correct VAR_DIR.
        var_dir = os.environ.get('MAILMAN_VAR_DIR')
        if var_dir is not None:
            env['MAILMAN_VAR_DIR'] = var_dir
        # For the testing framework, if these environment variables are set,
        # pass them on to the subprocess.
        for envvar in PRESERVE_ENVS:
            if envvar in os.environ:
                env[envvar] = os.environ[envvar]
        args.append(env)
        os.execle(*args)
        # We should never get here.
        raise RuntimeError('os.execle() failed')
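
    # For illustration only: a child exec'd for the spec 'in:0:1' ends up
    # running roughly
    #
    #     <python> <bin_dir>/runner -C <mailman.cfg> --runner=in:0:1
    #
    # with MAILMAN_UNDER_MASTER_CONTROL=1 in its environment. The bracketed
    # paths are placeholders, not literal values.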

    def start_runners(self, runner_names=None):
        """Start all the configured runners.

        :param runner_names: If given, a sequence of runner names to start.
            If not given, this sequence is taken from the configuration file.
        :type runner_names: a sequence of strings
        """
        if not runner_names:
            runner_names = []
            for runner_config in config.runner_configs:
                # Strip off the 'runner.' prefix.
                assert runner_config.name.startswith('runner.'), (
                    'Unexpected runner configuration section name: {}'.format(
                        runner_config.name))
                runner_names.append(runner_config.name[7:])
        # For each runner we want to start, find their config section, which
        # will tell us the name of the class to instantiate, along with the
        # number of hash space slices to manage.
        for name in runner_names:
            section_name = 'runner.' + name
            # Let AttributeError propagate.
            runner_config = getattr(config, section_name)
            if not as_boolean(runner_config.start):
                continue
            # Find out how many runners to instantiate. This must be a power
            # of 2.
            count = int(runner_config.instances)
            assert (count & (count - 1)) == 0, (
                'Runner "{0}", not a power of 2: {1}'.format(name, count))
            for slice_number in range(count):
                # runner name, slice #, # of slices, restart count
                info = (name, slice_number, count, 0)
                spec = '{0}:{1:d}:{2:d}'.format(name, slice_number, count)
                pid = self._start_runner(spec)
                log = logging.getLogger('mailman.runner')
                log.debug('[{0:d}] {1}'.format(pid, spec))
                self._kids.add(pid, info)
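
    # As a sketch of the configuration this method consumes (section name
    # real, values hypothetical), a stanza such as
    #
    #     [runner.in]
    #     start: yes
    #     instances: 2
    #
    # produces the specs 'in:0:2' and 'in:1:2', one forked child per hash
    # space slice.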

    def _pause(self):
        """Sleep until a signal is received."""
        # Sleep until a signal is received. This prevents the master from
        # exiting immediately even if there are no runners (as happens in the
        # test suite).
        signal.pause()

    def loop(self):
        """Main loop.

        Wait until all the runner subprocesses have exited, restarting them if
        necessary and configured to do so.
        """
        log = logging.getLogger('mailman.runner')
        log.info('Master started')
        self._pause()
        while True:
            try:
                pid, status = os.wait()
            except ChildProcessError:
                # No children? We're done.
                break
            except InterruptedError:                        # pragma: nocover
                # If the system call got interrupted, just restart it.
                continue
            if pid not in self._kids:                       # pragma: nocover
                # This is not a runner subprocess that we own. E.g. maybe a
                # plugin started it.
                continue
            # Find out why the subprocess exited by getting the signal
            # received or exit status.
            if os.WIFSIGNALED(status):
                why = os.WTERMSIG(status)
            elif os.WIFEXITED(status):
                why = os.WEXITSTATUS(status)
            else:
                why = None
            # We'll restart the subprocess if it exited with a SIGUSR1 or
            # because of a failure (i.e. no exit signal), and the no-restart
            # command line switch was not given. This lets us better handle
            # runaway restarts (e.g. if the subprocess had a syntax error!)
            rname, slice_number, count, restarts = self._kids.pop(pid)
            config_name = 'runner.' + rname
            restart = False
            if why == signal.SIGUSR1 and self._restartable:
                restart = True
            # Have we hit the maximum number of restarts?
            restarts += 1
            max_restarts = int(getattr(config, config_name).max_restarts)
            if restarts > max_restarts:
                restart = False
            # Are we permanently non-restartable?
            log.debug("""\
Master detected subprocess exit
(pid: {0:d}, why: {1}, class: {2}, slice: {3:d}/{4:d}) {5}""".format(
                pid, why, rname, slice_number + 1, count,
                ('[restarting]' if restart else '')))
            # See if we've reached the maximum number of allowable restarts.
            if restarts > max_restarts:
                log.info("""\
Runner {0} reached maximum restart limit of {1:d}, not restarting.""".format(
                    rname, max_restarts))
            # Now perhaps restart the process unless it exited with a
            # SIGTERM or we aren't restarting.
            if restart:
                spec = '{0}:{1:d}:{2:d}'.format(rname, slice_number, count)
                new_pid = self._start_runner(spec)
                new_info = (rname, slice_number, count, restarts)
                self._kids.add(new_pid, new_info)
        log.info('Master stopped')
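
    # Worked example (hypothetical numbers): a runner whose section sets
    # max_restarts to 10 and which keeps exiting with SIGUSR1 is restarted
    # with restart counts 1 through 10; on its 11th exit the count exceeds
    # the limit, the "maximum restart limit" message is logged, and the child
    # is not replaced.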

    def cleanup(self):
        """Ensure that all children have exited."""
        log = logging.getLogger('mailman.runner')
        # Send SIGTERMs to all the child processes and wait for them all to
        # exit.
        for pid in self._kids:
            try:
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:                      # pragma: nocover
                # The child has already exited.
                log.info('ESRCH on pid: %d', pid)
            except OSError:                                 # pragma: nocover
                # XXX I'm not so sure about this. It preserves the semantics
                # before conversion to PEP 3151 exceptions. But is it right?
                pass
        # Wait for all the children to go away.
        while self._kids:
            try:
                pid, status = os.wait()
                self._kids.drop(pid)
            except ChildProcessError:
                break
            except InterruptedError:                        # pragma: nocover
                continue
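
    # A note on termination: as written, PIDWatcher defines neither __len__
    # nor __bool__, so the `while self._kids` condition never becomes falsey
    # on its own; the loop instead ends when os.wait() raises
    # ChildProcessError once every child has been reaped.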


@click.command(
    cls=I18nCommand,
    context_settings=dict(help_option_names=['-h', '--help']),
    help=_("""\
    Master subprocess watcher.

    Start and watch the configured runners, ensuring that they stay alive and
    kicking. Each runner is forked and exec'd in turn, with the master waiting
    on their process ids. When it detects a child runner has exited, it may
    restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP. SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly. The master will
    restart runners that have exited due to a SIGUSR1 or some kind of other
    exit condition (say because of an uncaught exception). SIGHUP causes the
    master and the runners to close their log files, and reopen them upon the
    next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP, which it
    simply passes on to the runners. Note that the master will close and
    reopen its own log files on receipt of a SIGHUP. The master also leaves
    its own process id in the file specified in the configuration file but you
    normally don't need to use this PID directly."""))
@click.option(
    '-C', '--config', 'config_file',
    envvar='MAILMAN_CONFIG_FILE',
    type=click.Path(exists=True, dir_okay=False, resolve_path=True),
    help=_("""\
    Configuration file to use. If not given, the environment variable
    MAILMAN_CONFIG_FILE is consulted and used if set. If neither are given, a
    default configuration file is loaded."""))
@click.option(
    '--no-restart', '-n', 'restartable',
    is_flag=True, default=True,
    help=_("""\
    Don't restart the runners when they exit because of an error or a SIGUSR1.
    Use this only for debugging."""))
@click.option(
    '--force', '-f',
    is_flag=True, default=False,
    help=_("""\
    If the master watcher finds an existing master lock, it will normally exit
    with an error message. With this option, the master will perform an extra
    level of checking. If a process matching the host/pid described in the
    lock file is running, the master will still exit, requiring you to
    manually clean up the lock. But if no matching process is found, the
    master will remove the apparently stale lock and make another attempt to
    claim the master lock."""))
@click.option(
    '--runners', '-r',
    metavar='runner[:slice:range]',
    callback=validate_runner_spec, default=None,
    multiple=True,
    help=_("""\
    Override the default set of runners that the master will invoke, which is
    typically defined in the configuration file. Multiple -r options may be
    given. The values for -r are passed straight through to bin/runner."""))
@click.option(
    '-v', '--verbose',
    is_flag=True, default=False,
    help=_('Display more debugging information to the log file.'))
@click.version_option(MAILMAN_VERSION_FULL)
@public
def main(config_file, restartable, force, runners, verbose):
    # XXX https://github.com/pallets/click/issues/303
    """Master subprocess watcher.

    Start and watch the configured runners and ensure that they stay
    alive and kicking. Each runner is forked and exec'd in turn, with
    the master waiting on their process ids. When it detects a child
    runner has exited, it may restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP. SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly. The master
    will restart runners that have exited due to a SIGUSR1 or some kind
    of other exit condition (say because of an uncaught exception).
    SIGHUP causes the master and the runners to close their log files,
    and reopen them upon the next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP,
    which it simply passes on to the runners. Note that the master will
    close and reopen its own log files on receipt of a SIGHUP. The
    master also leaves its own process id in the file `data/master.pid`
    but you normally don't need to use this pid directly.
    """
    initialize(config_file, verbose)
    # Acquire the master lock, exiting if we can't. We'll let the caller
    # handle any clean up or lock breaking. No `with` statement here because
    # Lock's constructor doesn't support a timeout.
    lock = acquire_lock(force)
    try:
        with open(config.PID_FILE, 'w') as fp:
            print(os.getpid(), file=fp)
        loop = Loop(lock, restartable, config.filename)
        loop.install_signal_handlers()
        try:
            loop.start_runners(runners)
            loop.loop()
        finally:
            loop.cleanup()
            os.remove(config.PID_FILE)
    finally:
        lock.unlock()
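

# Illustrative invocation (normally 'mailman start' launches the master for
# you; the flags are the ones defined above, while the path and runner specs
# are placeholder values):
#
#     master -C /etc/mailman.cfg -r in:0:1 -r out:0:1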