src/mailman/bin/master.py
# Copyright (C) 2001-2019 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman. If not, see <http://www.gnu.org/licenses/>.

"""Master subprocess watcher."""

import os
import sys
import click
import signal
import socket
import logging

from datetime import timedelta
from enum import Enum
from flufl.lock import Lock, NotLockedError, TimeOutError
from lazr.config import as_boolean
from mailman.config import config
from mailman.core.i18n import _
from mailman.core.initialize import initialize
from mailman.core.logging import reopen
from mailman.utilities.options import I18nCommand, validate_runner_spec
from mailman.version import MAILMAN_VERSION_FULL
from public import public


DOT = '.'
LOCK_LIFETIME = timedelta(days=1, hours=6)
SECONDS_IN_A_DAY = 86400
SUBPROC_START_WAIT = timedelta(seconds=20)
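
# Cross-reference note: SECONDS_IN_A_DAY drives the daily SIGALRM lock refresh
# installed in Loop.install_signal_handlers() below; the extra six hours on
# LOCK_LIFETIME leaves a cushion so the lock should not expire between
# refreshes.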

# Environment variables to forward into subprocesses.
PRESERVE_ENVS = (
    'COVERAGE_PROCESS_START',
    'LANG',
    'LANGUAGE',
    'LC_ADDRESS',
    'LC_ALL',
    'LC_COLLATE',
    'LC_CTYPE',
    'LC_IDENTIFICATION',
    'LC_MEASUREMENT',
    'LC_MESSAGES',
    'LC_MONETARY',
    'LC_NAME',
    'LC_NUMERIC',
    'LC_PAPER',
    'LC_TELEPHONE',
    'LC_TIME',
    'LOCALE_ARCHIVE',
    'MAILMAN_EXTRA_TESTING_CFG',
    'PYTHONPATH',
    'PYTHONHOME',
    )


@public
class WatcherState(Enum):
    """Enum for the state of the master process watcher."""
    # No lock has been acquired by any process.
    none = 0
    # Another master watcher is running.
    conflict = 1
    # No conflicting process exists.
    stale_lock = 2
    # Hostname from lock file doesn't match.
    host_mismatch = 3


@public
def master_state(lock_file=None):
    """Get the state of the master watcher.

    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :type lock_file: str
    :return: 2-tuple of the WatcherState describing the state of the lock
        file, and the lock object.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    # We'll never acquire the lock, so the lifetime doesn't matter.
    lock = Lock(lock_file)
    try:
        hostname, pid, tempfile = lock.details
    except NotLockedError:
        return WatcherState.none, lock
    if hostname != socket.getfqdn():
        return WatcherState.host_mismatch, lock
    # Find out if the process exists by calling kill with signal 0.
    try:
        os.kill(pid, 0)
        return WatcherState.conflict, lock
    except ProcessLookupError:
        # No matching process id.
        return WatcherState.stale_lock, lock
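
# An illustrative (hypothetical) use of the pair returned above:
#
#     state, lock = master_state()
#     if state is WatcherState.conflict:
#         print('another master is already running on this host')
#
# acquire_lock() below relies on exactly this state to choose its error
# message when the lock cannot be taken.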


def acquire_lock_1(force, lock_file=None):
    """Try to acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :type force: bool
    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :type lock_file: str
    :return: The master lock.
    :raises: `TimeOutError` if the lock could not be acquired.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    lock = Lock(lock_file, LOCK_LIFETIME)
    try:
        lock.lock(timedelta(seconds=0.1))
        return lock
    except TimeOutError:
        if not force:
            raise
        # Force removal of lock first.
        lock.disown()
        hostname, pid, tempfile = lock.details
        os.unlink(lock_file)
        return acquire_lock_1(force=False)
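
# Flow sketch (informational, not additional behavior): with force=True a
# timed-out first attempt disowns the existing lock, unlinks the lock file by
# hand, and retries exactly once with force=False, so a second timeout simply
# propagates to the caller.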


def acquire_lock(force):
    """Acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :type force: bool
    :return: The master runner lock. If the lock could not be acquired, an
        error message is printed to standard error and the process exits.
    """
    try:
        lock = acquire_lock_1(force)
        return lock
    except TimeOutError:
        status, lock = master_state()
        if status is WatcherState.conflict:
            # Hostname matches and process exists.
            message = _("""\
The master lock could not be acquired because it appears as though another
master is already running.""")
        elif status is WatcherState.stale_lock:
            # Hostname matches but the process does not exist.
            program = sys.argv[0]                           # noqa: F841
            message = _("""\
The master lock could not be acquired. It appears as though there is a stale
master lock. Try re-running $program with the --force flag.""")
        elif status is WatcherState.host_mismatch:
            # Hostname doesn't even match.
            hostname, pid, tempfile = lock.details
            message = _("""\
The master lock could not be acquired, because it appears as if some process
on some other host may have acquired it. We can't test for stale locks across
host boundaries, so you'll have to clean this up manually.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        else:
            assert status is WatcherState.none, (
                'Invalid enum value: {}'.format(status))
            hostname, pid, tempfile = lock.details
            message = _("""\
For unknown reasons, the master lock could not be acquired.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        print(message, file=sys.stderr)
        sys.exit(1)
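
# A note on the messages above: the $program, $hostname and $config.LOCK_FILE
# placeholders are filled in by the _() translation machinery, which performs
# flufl.i18n-style substitution from the caller's namespace. That is also why
# `program` is assigned but carries a noqa: F841 marker; flake8 cannot see the
# indirect use.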


class PIDWatcher:
    """A class which safely manages child process ids."""

    def __init__(self):
        self._pids = {}

    def __contains__(self, pid):
        return pid in self._pids.keys()

    def __iter__(self):
        # Safely iterate over all the keys in the dictionary. Because
        # asynchronous signals are involved, the dictionary's size could
        # change during iteration. Iterate over a copy of the keys to avoid
        # that.
        for pid in list(self._pids):
            yield pid

    def add(self, pid, info):
        """Add process information.

        :param pid: The process id. The watcher must not already be tracking
            this process id.
        :type pid: int
        :param info: The process information.
        :type info: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        """
        old_info = self._pids.get(pid)
        assert old_info is None, (
            'Duplicate process id {0} with existing info: {1}'.format(
                pid, old_info))
        self._pids[pid] = info

    def pop(self, pid):
        """Remove and return existing process information.

        :param pid: The process id. The watcher must already be tracking this
            process id.
        :type pid: int
        :return: The process information.
        :rtype: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        :raise KeyError: if the process id is not being tracked.
        """
        return self._pids.pop(pid)

    def drop(self, pid):
        """Remove and return existing process information.

        This is like `pop()` except that no `KeyError` is raised if the
        process id is not being tracked.

        :param pid: The process id.
        :type pid: int
        :return: The process information, or None if the process id is not
            being tracked.
        :rtype: 4-tuple consisting of
            (runner-name, slice-number, slice-count, restart-count)
        """
        return self._pids.pop(pid, None)
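
    # Illustrative bookkeeping (hypothetical values): a populated watcher
    # might hold {1234: ('in', 0, 1, 0), 4321: ('out', 0, 1, 2)}, i.e. one
    # (runner-name, slice-number, slice-count, restart-count) entry per
    # tracked child pid.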


@public
class Loop:
    """Main control loop class."""

    def __init__(self, lock=None, restartable=None, config_file=None):
        self._lock = lock
        self._restartable = restartable
        self._config_file = config_file
        self._kids = PIDWatcher()

    def install_signal_handlers(self):
        """Install various signal handlers for control from the master."""
        log = logging.getLogger('mailman.runner')
        # Set up our signal handlers. Also set up a SIGALRM handler to
        # refresh the lock once per day. The lock lifetime is 1 day + 6 hours
        # so this should be plenty.
        def sigalrm_handler(signum, frame):                 # noqa: E306
            self._lock.refresh()
            signal.alarm(SECONDS_IN_A_DAY)
        signal.signal(signal.SIGALRM, sigalrm_handler)
        signal.alarm(SECONDS_IN_A_DAY)
        # SIGHUP tells the runners to close and reopen their log files.
        def sighup_handler(signum, frame):                  # noqa: E306
            reopen()
            for pid in self._kids:
                os.kill(pid, signal.SIGHUP)
            log.info('Master watcher caught SIGHUP. Re-opening log files.')
        signal.signal(signal.SIGHUP, sighup_handler)
        # SIGUSR1 is used by 'mailman restart'.
        def sigusr1_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGUSR1)
            log.info('Master watcher caught SIGUSR1. Exiting.')
        signal.signal(signal.SIGUSR1, sigusr1_handler)
        # SIGTERM is what init will kill this process with when changing run
        # levels. It's also the signal 'mailman stop' uses.
        def sigterm_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGTERM)
            log.info('Master watcher caught SIGTERM. Exiting.')
        signal.signal(signal.SIGTERM, sigterm_handler)
        # SIGINT is what control-C gives.
        def sigint_handler(signum, frame):                  # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGINT)
            log.info('Master watcher caught SIGINT. Restarting.')
        signal.signal(signal.SIGINT, sigint_handler)
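
    # Operational note (illustrative, not an API): these handlers are what
    # 'mailman stop' (SIGTERM) and 'mailman restart' (SIGUSR1) ultimately
    # exercise; sending the signal by hand, e.g. `kill -USR1 $(cat
    # data/master.pid)`, has the same effect. The pid file path shown is the
    # one quoted in main()'s docstring; the actual location comes from
    # config.PID_FILE.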

    def _start_runner(self, spec):
        """Start a runner.

        All arguments are passed to the process.

        :param spec: A runner spec, in a format acceptable to
            bin/runner's --runner argument, e.g. name:slice:count
        :type spec: string
        :return: The process id of the child runner.
        :rtype: int
        """
        pid = os.fork()
        if pid:
            # Parent.
            return pid
        # Child.
        #
        # Set the environment variable which tells the runner that it's
        # running under bin/master control. This subtly changes the error
        # behavior of bin/runner.
        env = {'MAILMAN_UNDER_MASTER_CONTROL': '1'}
        # Craft the command line arguments for the exec() call.
        rswitch = '--runner=' + spec
        # Always pass the explicit path to the configuration file to the
        # sub-runners. This avoids any debate about which cfg file is used.
        config_file = (config.filename if self._config_file is None
                       else self._config_file)
        # Wherever master lives, so too must live the runner script.
        exe = os.path.join(config.BIN_DIR, 'runner')        # pragma: nocover
        # sys.executable, the absolute path to the Python interpreter, must
        # be given as argv[0] due to Python's library search algorithm.
        args = [sys.executable, sys.executable, exe,        # pragma: nocover
                '-C', config_file, rswitch]
        log = logging.getLogger('mailman.runner')
        log.debug('starting: %s', args)
        # We must pass this environment variable through if it's set,
        # otherwise runner processes will not have the correct VAR_DIR.
        var_dir = os.environ.get('MAILMAN_VAR_DIR')
        if var_dir is not None:
            env['MAILMAN_VAR_DIR'] = var_dir
        # For the testing framework, if these environment variables are set,
        # pass them on to the subprocess.
        for envvar in PRESERVE_ENVS:
            if envvar in os.environ:
                env[envvar] = os.environ[envvar]
        args.append(env)
        os.execle(*args)
        # We should never get here.
        raise RuntimeError('os.execle() failed')
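
    # For illustration only: a child exec'd for the spec 'in:0:1' ends up
    # running roughly
    #
    #     <python> <bin_dir>/runner -C <mailman.cfg> --runner=in:0:1
    #
    # with MAILMAN_UNDER_MASTER_CONTROL=1 in its environment. The bracketed
    # paths are placeholders, not literal values.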

    def start_runners(self, runner_names=None):
        """Start all the configured runners.

        :param runner_names: If given, a sequence of runner names to start.
            If not given, this sequence is taken from the configuration file.
        :type runner_names: a sequence of strings
        """
        if not runner_names:
            runner_names = []
            for runner_config in config.runner_configs:
                # Strip off the 'runner.' prefix.
                assert runner_config.name.startswith('runner.'), (
                    'Unexpected runner configuration section name: {}'.format(
                        runner_config.name))
                runner_names.append(runner_config.name[7:])
        # For each runner we want to start, find their config section, which
        # will tell us the name of the class to instantiate, along with the
        # number of hash space slices to manage.
        for name in runner_names:
            section_name = 'runner.' + name
            # Let AttributeError propagate.
            runner_config = getattr(config, section_name)
            if not as_boolean(runner_config.start):
                continue
            # Find out how many runners to instantiate. This must be a power
            # of 2.
            count = int(runner_config.instances)
            assert (count & (count - 1)) == 0, (
                'Runner "{0}", not a power of 2: {1}'.format(name, count))
            for slice_number in range(count):
                # runner name, slice #, # of slices, restart count
                info = (name, slice_number, count, 0)
                spec = '{0}:{1:d}:{2:d}'.format(name, slice_number, count)
                pid = self._start_runner(spec)
                log = logging.getLogger('mailman.runner')
                log.debug('[{0:d}] {1}'.format(pid, spec))
                self._kids.add(pid, info)
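
    # As a sketch of the configuration this method consumes (section name
    # real, values hypothetical), a stanza such as
    #
    #     [runner.in]
    #     start: yes
    #     instances: 2
    #
    # produces the specs 'in:0:2' and 'in:1:2', one forked child per hash
    # space slice.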

    def _pause(self):
        """Sleep until a signal is received."""
        # Sleep until a signal is received. This prevents the master from
        # exiting immediately even if there are no runners (as happens in the
        # test suite).
        signal.pause()

    def loop(self):
        """Main loop.

        Wait until all the runner subprocesses have exited, restarting them if
        necessary and configured to do so.
        """
        log = logging.getLogger('mailman.runner')
        log.info('Master started')
        self._pause()
        while True:
            try:
                pid, status = os.wait()
            except ChildProcessError:
                # No children? We're done.
                break
            except InterruptedError:                        # pragma: nocover
                # If the system call got interrupted, just restart it.
                continue
            if pid not in self._kids:                       # pragma: nocover
                # This is not a runner subprocess that we own. E.g. maybe a
                # plugin started it.
                continue
            # Find out why the subprocess exited by getting the signal
            # received or exit status.
            if os.WIFSIGNALED(status):
                why = os.WTERMSIG(status)
            elif os.WIFEXITED(status):
                why = os.WEXITSTATUS(status)
            else:
                why = None
            # We'll restart the subprocess if it exited with a SIGUSR1 or
            # because of a failure (i.e. no exit signal), and the no-restart
            # command line switch was not given. This lets us better handle
            # runaway restarts (e.g. if the subprocess had a syntax error!)
            rname, slice_number, count, restarts = self._kids.pop(pid)
            config_name = 'runner.' + rname
            restart = False
            if why == signal.SIGUSR1 and self._restartable:
                restart = True
            # Have we hit the maximum number of restarts?
            restarts += 1
            max_restarts = int(getattr(config, config_name).max_restarts)
            if restarts > max_restarts:
                restart = False
            # Are we permanently non-restartable?
            log.debug("""\
Master detected subprocess exit
(pid: {0:d}, why: {1}, class: {2}, slice: {3:d}/{4:d}) {5}""".format(
                pid, why, rname, slice_number + 1, count,
                ('[restarting]' if restart else '')))
            # See if we've reached the maximum number of allowable restarts.
            if restarts > max_restarts:
                log.info("""\
Runner {0} reached maximum restart limit of {1:d}, not restarting.""".format(
                    rname, max_restarts))
            # Now perhaps restart the process unless it exited with a
            # SIGTERM or we aren't restarting.
            if restart:
                spec = '{0}:{1:d}:{2:d}'.format(rname, slice_number, count)
                new_pid = self._start_runner(spec)
                new_info = (rname, slice_number, count, restarts)
                self._kids.add(new_pid, new_info)
        log.info('Master stopped')
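
    # Worked example (hypothetical numbers): a runner whose section sets
    # max_restarts to 10 and which keeps exiting with SIGUSR1 is restarted
    # with restart counts 1 through 10; on its 11th exit the count exceeds
    # the limit, the "maximum restart limit" message is logged, and the child
    # is not replaced.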

    def cleanup(self):
        """Ensure that all children have exited."""
        log = logging.getLogger('mailman.runner')
        # Send SIGTERMs to all the child processes and wait for them all to
        # exit.
        for pid in self._kids:
            try:
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:                      # pragma: nocover
                # The child has already exited.
                log.info('ESRCH on pid: %d', pid)
            except OSError:                                 # pragma: nocover
                # XXX I'm not so sure about this. It preserves the semantics
                # before conversion to PEP 3151 exceptions. But is it right?
                pass
        # Wait for all the children to go away.
        while self._kids:
            try:
                pid, status = os.wait()
                self._kids.drop(pid)
            except ChildProcessError:
                break
            except InterruptedError:                        # pragma: nocover
                continue
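
    # A note on termination: as written, PIDWatcher defines neither __len__
    # nor __bool__, so the `while self._kids` condition never becomes falsey
    # on its own; the loop instead ends when os.wait() raises
    # ChildProcessError once every child has been reaped.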


@click.command(
    cls=I18nCommand,
    context_settings=dict(help_option_names=['-h', '--help']),
    help=_("""\
    Master subprocess watcher.

    Start and watch the configured runners, ensuring that they stay alive and
    kicking. Each runner is forked and exec'd in turn, with the master waiting
    on their process ids. When it detects a child runner has exited, it may
    restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP. SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly. The master will
    restart runners that have exited due to a SIGUSR1 or some kind of other
    exit condition (say because of an uncaught exception). SIGHUP causes the
    master and the runners to close their log files, and reopen them upon the
    next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP, which it
    simply passes on to the runners. Note that the master will close and
    reopen its own log files on receipt of a SIGHUP. The master also leaves
    its own process id in the file specified in the configuration file but you
    normally don't need to use this PID directly."""))
@click.option(
    '-C', '--config', 'config_file',
    envvar='MAILMAN_CONFIG_FILE',
    type=click.Path(exists=True, dir_okay=False, resolve_path=True),
    help=_("""\
    Configuration file to use. If not given, the environment variable
    MAILMAN_CONFIG_FILE is consulted and used if set. If neither are given, a
    default configuration file is loaded."""))
@click.option(
    '--no-restart', '-n', 'restartable',
    is_flag=True, default=True,
    help=_("""\
    Don't restart the runners when they exit because of an error or a SIGUSR1.
    Use this only for debugging."""))
@click.option(
    '--force', '-f',
    is_flag=True, default=False,
    help=_("""\
    If the master watcher finds an existing master lock, it will normally exit
    with an error message. With this option, the master will perform an extra
    level of checking. If a process matching the host/pid described in the
    lock file is running, the master will still exit, requiring you to
    manually clean up the lock. But if no matching process is found, the
    master will remove the apparently stale lock and make another attempt to
    claim the master lock."""))
@click.option(
    '--runners', '-r',
    metavar='runner[:slice:range]',
    callback=validate_runner_spec, default=None,
    multiple=True,
    help=_("""\
    Override the default set of runners that the master will invoke, which is
    typically defined in the configuration file. Multiple -r options may be
    given. The values for -r are passed straight through to bin/runner."""))
@click.option(
    '-v', '--verbose',
    is_flag=True, default=False,
    help=_('Display more debugging information to the log file.'))
@click.version_option(MAILMAN_VERSION_FULL)
@public
def main(config_file, restartable, force, runners, verbose):
    # XXX https://github.com/pallets/click/issues/303
    """Master subprocess watcher.

    Start and watch the configured runners and ensure that they stay
    alive and kicking. Each runner is forked and exec'd in turn, with
    the master waiting on their process ids. When it detects a child
    runner has exited, it may restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP. SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly. The master
    will restart runners that have exited due to a SIGUSR1 or some kind
    of other exit condition (say because of an uncaught exception).
    SIGHUP causes the master and the runners to close their log files,
    and reopen them upon the next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP,
    which it simply passes on to the runners. Note that the master will
    close and reopen its own log files on receipt of a SIGHUP. The
    master also leaves its own process id in the file `data/master.pid`
    but you normally don't need to use this pid directly.
    """
    initialize(config_file, verbose)
    # Acquire the master lock, exiting if we can't. We'll let the caller
    # handle any clean up or lock breaking. No `with` statement here because
    # Lock's constructor doesn't support a timeout.
    lock = acquire_lock(force)
    try:
        with open(config.PID_FILE, 'w') as fp:
            print(os.getpid(), file=fp)
        loop = Loop(lock, restartable, config.filename)
        loop.install_signal_handlers()
        try:
            loop.start_runners(runners)
            loop.loop()
        finally:
            loop.cleanup()
            os.remove(config.PID_FILE)
    finally:
        lock.unlock()
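

# Illustrative invocation (normally 'mailman start' launches the master for
# you; the flags are the ones defined above, while the path and runner specs
# are placeholder values):
#
#     master -C /etc/mailman.cfg -r in:0:1 -r out:0:1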