# Copyright (C) 2001-2019 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.
18 """Master subprocess watcher."""

import os
import sys
import click
import signal
import socket
import logging

from datetime import timedelta
from enum import Enum
from flufl.lock import Lock, NotLockedError, TimeOutError
from lazr.config import as_boolean
from mailman.config import config
from mailman.core.i18n import _
from mailman.core.initialize import initialize
from mailman.core.logging import reopen
from mailman.utilities.options import I18nCommand, validate_runner_spec
from mailman.version import MAILMAN_VERSION_FULL
from public import public


LOCK_LIFETIME = timedelta(days=1, hours=6)
SECONDS_IN_A_DAY = 86400
SUBPROC_START_WAIT = timedelta(seconds=20)
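
# The master refreshes the lock once a day via its SIGALRM handler (see
# Loop.install_signal_handlers below); with a lifetime of one day plus six
# hours, a refresh always lands well before the lock can expire.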

# Environment variables to forward into subprocesses.
PRESERVE_ENVS = (
    'COVERAGE_PROCESS_START',
    'MAILMAN_EXTRA_TESTING_CFG',
    )


@public
class WatcherState(Enum):
    """Enum for the state of the master process watcher."""
    # No lock has been acquired by any process.
    none = 0
    # Another master watcher is running.
    conflict = 1
    # No conflicting process exists.
    stale_lock = 2
    # Hostname from lock file doesn't match.
    host_mismatch = 3


@public
def master_state(lock_file=None):
    """Get the state of the master watcher.

    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :return: 2-tuple of the WatcherState describing the state of the lock
        file, and the lock object.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    # We'll never acquire the lock, so the lifetime doesn't matter.
    lock = Lock(lock_file)
    try:
        hostname, pid, tempfile = lock.details
    except NotLockedError:
        return WatcherState.none, lock
    if hostname != socket.getfqdn():
        return WatcherState.host_mismatch, lock
    # Find out if the process exists by calling kill with a signal 0.
    try:
        os.kill(pid, 0)
        return WatcherState.conflict, lock
    except ProcessLookupError:
        # No matching process id.
        return WatcherState.stale_lock, lock
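
# A caller (for instance a start/stop CLI command) can consult the watcher
# state before touching the lock; a minimal, hypothetical sketch:
#
#     state, lock = master_state()
#     if state is WatcherState.conflict:
#         print('A master watcher is already running', file=sys.stderr)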


@public
def acquire_lock_1(force, lock_file=None):
    """Try to acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :param lock_file: Path to the lock file, otherwise `config.LOCK_FILE`.
    :return: The master lock.
    :raises: `TimeOutError` if the lock could not be acquired.
    """
    if lock_file is None:
        lock_file = config.LOCK_FILE
    lock = Lock(lock_file, LOCK_LIFETIME)
    try:
        lock.lock(timedelta(seconds=0.1))
        return lock
    except TimeOutError:
        if not force:
            raise
        # Force removal of lock first.
        hostname, pid, tempfile = lock.details
        os.unlink(lock_file)
        return acquire_lock_1(force=False)


@public
def acquire_lock(force):
    """Acquire the master lock.

    :param force: Flag that controls whether to force acquisition of the lock.
    :return: The master runner lock or None if the lock couldn't be acquired.
        In that case, an error message is also printed to standard error.
    """
    try:
        lock = acquire_lock_1(force)
        return lock
    except TimeOutError:
        status, lock = master_state()
        if status is WatcherState.conflict:
            # Hostname matches and process exists.
            message = _("""\
The master lock could not be acquired because it appears as though another
master is already running.""")
        elif status is WatcherState.stale_lock:
            # Hostname matches but the process does not exist.
            program = sys.argv[0]                            # noqa: F841
            message = _("""\
The master lock could not be acquired.  It appears as though there is a stale
master lock.  Try re-running $program with the --force flag.""")
        elif status is WatcherState.host_mismatch:
            # Hostname doesn't even match.
            hostname, pid, tempfile = lock.details
            message = _("""\
The master lock could not be acquired, because it appears as if some process
on some other host may have acquired it.  We can't test for stale locks across
host boundaries, so you'll have to clean this up manually.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        else:
            assert status is WatcherState.none, (
                'Invalid enum value: {}'.format(status))
            hostname, pid, tempfile = lock.details
            message = _("""\
For unknown reasons, the master lock could not be acquired.

Lock file: $config.LOCK_FILE
Lock host: $hostname

Exiting.""")
        print(message, file=sys.stderr)
        sys.exit(1)
189 """A class which safely manages child process ids."""
194 def __contains__(self
, pid
):
195 return pid
in self
._pids
.keys()
198 # Safely iterate over all the keys in the dictionary. Because
199 # asynchronous signals are involved, the dictionary's size could
200 # change during iteration. Iterate over a copy of the keys to avoid
202 for pid
in self
._pids
.keys():
205 def add(self
, pid
, info
):
206 """Add process information.
208 :param pid: The process id. The watcher must not already be tracking
211 :param info: The process information.
212 :type info: 4-tuple consisting of
213 (runner-name, slice-number, slice-count, restart-count)
215 old_info
= self
._pids
.get(pid
)
216 assert old_info
is None, (
217 'Duplicate process id {0} with existing info: {1}'.format(
219 self
._pids
[pid
] = info
222 """Remove and return existing process information.
224 :param pid: The process id. The watcher must already be tracking this
227 :return: The process information.
228 :rtype: 4-tuple consisting of
229 (runner-name, slice-number, slice-count, restart-count)
230 :raise KeyError: if the process id is not being tracked.
232 return self
._pids
.pop(pid
)
235 """Remove and return existing process information.
237 This is like `pop()` except that no `KeyError` is raised if the
238 process id is not being tracked.
240 :param pid: The process id.
242 :return: The process information, or None if the process id is not
244 :rtype: 4-tuple consisting of
245 (runner-name, slice-number, slice-count, restart-count)
247 return self
._pids
.pop(pid
, None)
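
# Each tracked child maps to a 4-tuple such as ('in', 0, 4, 2), i.e. slice 0
# of 4 for the 'in' runner, restarted twice so far (values are illustrative).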
252 """Main control loop class."""
254 def __init__(self
, lock
=None, restartable
=None, config_file
=None):
256 self
._restartable
= restartable
257 self
._config
_file
= config_file
258 self
._kids
= PIDWatcher()

    def install_signal_handlers(self):
        """Install various signals handlers for control from the master."""
        log = logging.getLogger('mailman.runner')
        # Set up our signal handlers.  Also set up a SIGALRM handler to
        # refresh the lock once per day.  The lock lifetime is 1 day + 6 hours
        # so this should be plenty.
        def sigalrm_handler(signum, frame):                 # noqa: E306
            self._lock.refresh()
            signal.alarm(SECONDS_IN_A_DAY)
        signal.signal(signal.SIGALRM, sigalrm_handler)
        signal.alarm(SECONDS_IN_A_DAY)
        # SIGHUP tells the runners to close and reopen their log files.
        def sighup_handler(signum, frame):                  # noqa: E306
            reopen()
            for pid in self._kids:
                os.kill(pid, signal.SIGHUP)
            log.info('Master watcher caught SIGHUP.  Re-opening log files.')
        signal.signal(signal.SIGHUP, sighup_handler)
        # SIGUSR1 is used by 'mailman restart'.
        def sigusr1_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGUSR1)
            log.info('Master watcher caught SIGUSR1.  Exiting.')
        signal.signal(signal.SIGUSR1, sigusr1_handler)
        # SIGTERM is what init will kill this process with when changing run
        # levels.  It's also the signal 'mailman stop' uses.
        def sigterm_handler(signum, frame):                 # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGTERM)
            log.info('Master watcher caught SIGTERM.  Exiting.')
        signal.signal(signal.SIGTERM, sigterm_handler)
        # SIGINT is what control-C gives.
        def sigint_handler(signum, frame):                  # noqa: E306
            for pid in self._kids:
                os.kill(pid, signal.SIGINT)
            log.info('Master watcher caught SIGINT.  Restarting.')
        signal.signal(signal.SIGINT, sigint_handler)
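
    # The handlers above are what 'mailman stop' and 'mailman restart' rely
    # on: those commands simply signal the master process, e.g. the
    # equivalent of os.kill(master_pid, signal.SIGUSR1) to cycle the runners
    # (master_pid here is illustrative, read from the PID file).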

    def _start_runner(self, spec):
        """Start a runner.

        All arguments are passed to the process.

        :param spec: A runner spec, in a format acceptable to
            bin/runner's --runner argument, e.g. name:slice:count
        :return: The process id of the child runner.
        """
        pid = os.fork()
        if pid:
            # Parent.
            return pid
        # Child.
        #
        # Set the environment variable which tells the runner that it's
        # running under bin/master control.  This subtly changes the error
        # behavior of bin/runner.
        env = {'MAILMAN_UNDER_MASTER_CONTROL': '1'}
        # Craft the command line arguments for the exec() call.
        rswitch = '--runner=' + spec
        # Always pass the explicit path to the configuration file to the
        # sub-runners.  This avoids any debate about which cfg file is used.
        config_file = (config.filename if self._config_file is None
                       else self._config_file)
        # Wherever master lives, so too must live the runner script.
        exe = os.path.join(config.BIN_DIR, 'runner')        # pragma: nocover
        # The absolute path to the Python interpreter must be given as
        # argv[0] due to Python's library search algorithm.
        args = [sys.executable, sys.executable, exe,        # pragma: nocover
                '-C', config_file, rswitch]
        log = logging.getLogger('mailman.runner')
        log.debug('starting: %s', args)
        # We must pass this environment variable through if it's set,
        # otherwise runner processes will not have the correct VAR_DIR.
        var_dir = os.environ.get('MAILMAN_VAR_DIR')
        if var_dir is not None:
            env['MAILMAN_VAR_DIR'] = var_dir
        # For the testing framework, if these environment variables are set,
        # pass them on to the subprocess.
        for envvar in PRESERVE_ENVS:
            if envvar in os.environ:
                env[envvar] = os.environ[envvar]
        os.execle(*args, env)                               # pragma: nocover
        # We should never get here.
        raise RuntimeError('os.execle() failed')
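
    # For a spec such as 'in:0:1' with a default layout, the exec'd command
    # line looks roughly like (paths are illustrative):
    #
    #     /usr/bin/python3 /usr/bin/python3 /opt/mailman/bin/runner \
    #         -C /opt/mailman/mailman.cfg --runner=in:0:1
    #
    # The interpreter appears twice because the first element is the path
    # passed to exec and the second is argv[0].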

    def start_runners(self, runner_names=None):
        """Start all the configured runners.

        :param runners: If given, a sequence of runner names to start.  If not
            given, this sequence is taken from the configuration file.
        :type runners: a sequence of strings
        """
        if not runner_names:
            runner_names = []
            for runner_config in config.runner_configs:
                # Strip off the 'runner.' prefix.
                assert runner_config.name.startswith('runner.'), (
                    'Unexpected runner configuration section name: {}'.format(
                        runner_config.name))
                runner_names.append(runner_config.name[7:])
        # For each runner we want to start, find their config section, which
        # will tell us the name of the class to instantiate, along with the
        # number of hash space slices to manage.
        for name in runner_names:
            section_name = 'runner.' + name
            # Let AttributeError propagate.
            runner_config = getattr(config, section_name)
            if not as_boolean(runner_config.start):
                continue
            # Find out how many runners to instantiate.  This must be a power
            # of 2.
            count = int(runner_config.instances)
            assert (count & (count - 1)) == 0, (
                'Runner "{0}", not a power of 2: {1}'.format(name, count))
            for slice_number in range(count):
                # runner name, slice #, # of slices, restart count
                info = (name, slice_number, count, 0)
                spec = '{0}:{1:d}:{2:d}'.format(name, slice_number, count)
                pid = self._start_runner(spec)
                log = logging.getLogger('mailman.runner')
                log.debug('[{0:d}] {1}'.format(pid, spec))
                self._kids.add(pid, info)
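
    # As an example, a hypothetical [runner.out] section with 'instances: 4'
    # forks four children with specs 'out:0:4' through 'out:3:4', each
    # claiming one of the four hash-space slices; that is why the instance
    # count must be a power of 2.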
387 """Sleep until a signal is received."""
388 # Sleep until a signal is received. This prevents the master from
389 # exiting immediately even if there are no runners (as happens in the
396 Wait until all the runner subprocesses have exited, restarting them if
397 necessary and configured to do so.
399 log
= logging
.getLogger('mailman.runner')
400 log
.info('Master started')
404 pid
, status
= os
.wait()
405 except ChildProcessError
:
406 # No children? We're done.
408 except InterruptedError
: # pragma: nocover
409 # If the system call got interrupted, just restart it.
411 if pid
not in self
._kids
: # pragma: nocover
412 # This is not a runner subprocess that we own. E.g. maybe a
415 # Find out why the subprocess exited by getting the signal
416 # received or exit status.
417 if os
.WIFSIGNALED(status
):
418 why
= os
.WTERMSIG(status
)
419 elif os
.WIFEXITED(status
):
420 why
= os
.WEXITSTATUS(status
)
423 # We'll restart the subprocess if it exited with a SIGUSR1 or
424 # because of a failure (i.e. no exit signal), and the no-restart
425 # command line switch was not given. This lets us better handle
426 # runaway restarts (e.g. if the subprocess had a syntax error!)
427 rname
, slice_number
, count
, restarts
= self
._kids
.pop(pid
)
428 config_name
= 'runner.' + rname
430 if why
== signal
.SIGUSR1
and self
._restartable
:
432 # Have we hit the maximum number of restarts?
434 max_restarts
= int(getattr(config
, config_name
).max_restarts
)
435 if restarts
> max_restarts
:
437 # Are we permanently non-restartable?
439 Master detected subprocess exit
440 (pid: {0:d}, why: {1}, class: {2}, slice: {3:d}/{4:d}) {5}""".format(
441 pid
, why
, rname
, slice_number
+ 1, count
,
442 ('[restarting]' if restart
else '')))
443 # See if we've reached the maximum number of allowable restarts.
444 if restarts
> max_restarts
:
446 Runner {0} reached maximum restart limit of {1:d}, not restarting.""",
448 # Now perhaps restart the process unless it exited with a
449 # SIGTERM or we aren't restarting.
451 spec
= '{0}:{1:d}:{2:d}'.format(rname
, slice_number
, count
)
452 new_pid
= self
._start
_runner
(spec
)
453 new_info
= (rname
, slice_number
, count
, restarts
)
454 self
._kids
.add(new_pid
, new_info
)
455 log
.info('Master stopped')
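
    # Concretely: a child that exits because of SIGUSR1 (e.g. via 'mailman
    # restart') is forked again with its restart count bumped, while one that
    # hits max_restarts, exits after SIGTERM, or exits while --no-restart is
    # in effect is simply dropped from the watcher.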
458 """Ensure that all children have exited."""
459 log
= logging
.getLogger('mailman.runner')
460 # Send SIGTERMs to all the child processes and wait for them all to
462 for pid
in self
._kids
:
464 os
.kill(pid
, signal
.SIGTERM
)
465 except ProcessLookupError
: # pragma: nocover
466 # The child has already exited.
467 log
.info('ESRCH on pid: %d', pid
)
468 except OSError: # pragma: nocover
469 # XXX I'm not so sure about this. It preserves the semantics
470 # before conversion to PEP 3151 exceptions. But is it right?
472 # Wait for all the children to go away.
475 pid
, status
= os
.wait()
477 except ChildProcessError
:
479 except InterruptedError
: # pragma: nocover


@click.command(
    cls=I18nCommand,
    context_settings=dict(help_option_names=['-h', '--help']),
    help=_("""\
    Master subprocess watcher.

    Start and watch the configured runners, ensuring that they stay alive and
    kicking.  Each runner is forked and exec'd in turn, with the master
    waiting on their process ids.  When it detects a child runner has exited,
    it may restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP.  SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly.  The master will
    restart runners that have exited due to a SIGUSR1 or some kind of other
    exit condition (say because of an uncaught exception).  SIGHUP causes the
    master and the runners to close their log files, and reopen them upon the
    next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP, which it
    simply passes on to the runners.  Note that the master will close and
    reopen its own log files on receipt of a SIGHUP.  The master also leaves
    its own process id in the file specified in the configuration file but you
    normally don't need to use this PID directly."""))
@click.option(
    '-C', '--config', 'config_file',
    envvar='MAILMAN_CONFIG_FILE',
    type=click.Path(exists=True, dir_okay=False, resolve_path=True),
    help=_("""\
    Configuration file to use.  If not given, the environment variable
    MAILMAN_CONFIG_FILE is consulted and used if set.  If neither are given, a
    default configuration file is loaded."""))
@click.option(
    '--no-restart', '-n', 'restartable',
    is_flag=True, default=True,
    help=_("""\
    Don't restart the runners when they exit because of an error or a SIGUSR1.
    Use this only for debugging."""))
@click.option(
    '--force', '-f',
    is_flag=True, default=False,
    help=_("""\
    If the master watcher finds an existing master lock, it will normally exit
    with an error message.  With this option, the master will perform an extra
    level of checking.  If a process matching the host/pid described in the
    lock file is running, the master will still exit, requiring you to
    manually clean up the lock.  But if no matching process is found, the
    master will remove the apparently stale lock and make another attempt to
    claim the master lock."""))
@click.option(
    '-r', '--runner', 'runners',
    metavar='runner[:slice:range]',
    callback=validate_runner_spec, default=None,
    multiple=True,
    help=_("""\
    Override the default set of runners that the master will invoke, which is
    typically defined in the configuration file.  Multiple -r options may be
    given.  The values for -r are passed straight through to bin/runner."""))
@click.option(
    '-v', '--verbose',
    is_flag=True, default=False,
    help=_('Display more debugging information to the log file.'))
@click.version_option(MAILMAN_VERSION_FULL)
@public
def main(config_file, restartable, force, runners, verbose):
    # XXX https://github.com/pallets/click/issues/303
    """Master subprocess watcher.

    Start and watch the configured runners and ensure that they stay
    alive and kicking.  Each runner is forked and exec'd in turn, with
    the master waiting on their process ids.  When it detects a child
    runner has exited, it may restart it.

    The runners respond to SIGINT, SIGTERM, SIGUSR1 and SIGHUP.  SIGINT,
    SIGTERM and SIGUSR1 all cause a runner to exit cleanly.  The master
    will restart runners that have exited due to a SIGUSR1 or some kind
    of other exit condition (say because of an uncaught exception).
    SIGHUP causes the master and the runners to close their log files,
    and reopen them upon the next printed message.

    The master also responds to SIGINT, SIGTERM, SIGUSR1 and SIGHUP,
    which it simply passes on to the runners.  Note that the master will
    close and reopen its own log files on receipt of a SIGHUP.  The
    master also leaves its own process id in the file `data/master.pid`
    but you normally don't need to use this pid directly.
    """
    initialize(config_file, verbose)
    # Acquire the master lock, exiting if we can't.  We'll let the caller
    # handle any clean up or lock breaking.  No `with` statement here because
    # Lock's constructor doesn't support a timeout.
    lock = acquire_lock(force)
    try:
        with open(config.PID_FILE, 'w') as fp:
            print(os.getpid(), file=fp)
        loop = Loop(lock, restartable, config.filename)
        loop.install_signal_handlers()
        try:
            loop.start_runners(runners)
            loop.loop()
        finally:
            loop.cleanup()
            os.remove(config.PID_FILE)
    finally:
        lock.unlock()