KVM test: tests_base.cfg. sample: Fix test dependencies
[autotest-zwu.git] / scheduler / monitor_db_babysitter
blob4625781d62721c3902105e3085d3def9e894593b
1 #!/usr/bin/python -u
2 import os, sys, signal, time, subprocess, logging
3 from optparse import OptionParser
4 import common
5 from autotest_lib.scheduler import babysitter_logging_config
6 from autotest_lib.client.common_lib import error, global_config, utils
7 from autotest_lib.client.common_lib import logging_manager
8 from autotest_lib.scheduler import scheduler_logging_config
9 from autotest_lib.scheduler import monitor_db
# Seconds slept between successive health checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may keep the same size before monitor_db is
# declared stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

# Command-line interface: only flags are accepted, no positional arguments.
parser = OptionParser()
parser.add_option(
    "-r",
    action="store_true",
    dest="recover",
    help="run recovery mode (implicit after any crash)",
)
parser.add_option(
    "--background",
    action="store_true",
    dest="background",
    default=False,
    help="runs the scheduler monitor on background",
)
(options, args) = parser.parse_args()

# Locations are derived from this script's directory: its parent is taken
# as the autotest root.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')

# options.recover is None unless -r was given; normalise it to a real bool.
recover = options.recover is True

# Any leftover positional argument is a usage error.
if args:
    parser.print_help()
    sys.exit(1)
def run_banner_output(cmd):
    """Run `cmd` and return its output underneath a dashed banner.

    The result has the form::

        ------ CMD ------
        CMD_OUTPUT
        <blank line>

    where the banner is `cmd` centred in a 60-column field of dashes and
    CMD_OUTPUT is the command's stdout followed by its stderr. The command
    is run with a 30 second timeout and a non-zero exit status is ignored.
    If running it raises error.CmdError, the output is reported as
    'Timed out'.

    @param cmd: command line string to execute.
    @return: the banner and output as a single string.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format the banner and the output in a single step. Building a
    # template from `cmd` first and applying '%' to it afterwards would
    # treat any '%' inside the command (e.g. in an interpolated password)
    # as a conversion specifier and raise.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
def kill_monitor():
    """Stop the monitor_db process recorded under monitor_db.PID_FILE_PREFIX.

    A SIGINT is sent first so the scheduler can shut down on its own. If
    the program is still reported alive right after that, it is given 30
    seconds and then signalled again, this time with the helper's default
    signal.
    """
    logging.info("Killing monitor_db")
    # try shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if utils.program_is_alive(monitor_db.PID_FILE_PREFIX):  # was it killed?
        # give it some time to shutdown
        time.sleep(30)
        # kill it
        # NOTE(review): this used to call utils.signal_process() with the
        # same PID-file prefix. Everything else here addresses the monitor
        # through the *_program helpers keyed on that prefix, so the same
        # helper is used for the final kill - confirm against
        # client/common_lib utils that signal_program's default signal is
        # the intended one.
        utils.signal_program(monitor_db.PID_FILE_PREFIX)
def handle_sigterm(signum, frame):
    """SIGTERM handler: take monitor_db down with us, then exit.

    Kills the monitored scheduler, removes the babysitter's own PID file
    if there is one, and terminates the process with exit status 1.

    @param signum: signal number delivered (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    kill_monitor()
    pid_file_prefix = monitor_db.BABYSITTER_PID_FILE_PREFIX
    utils.delete_pid_file_if_exists(pid_file_prefix)
    raise SystemExit(1)


# From here on a SIGTERM sent to the babysitter runs the handler above.
signal.signal(signal.SIGTERM, handle_sigterm)
# Base class for MonitorProc, looked up in the optional site module.
# `object` is passed as the last argument - presumably the fallback used
# when no site-specific class is available; confirm against
# utils.import_site_class.
SiteMonitorProc = utils.import_site_class(
    __file__,
    'autotest_lib.scheduler.site_monitor_db_babysitter',
    'SiteMonitorProc',
    object,
)
class MonitorProc(SiteMonitorProc):
    """One monitor_db child process plus the state needed to supervise it.

    Liveness is judged in two ways: the child must not have exited, and
    the scheduler log file must change size at least once every
    STALL_TIMEOUT seconds.
    """

    def __init__(self, do_recovery=False):
        """Prepare (but do not launch) a new monitor_db process.

        Any monitor_db that is still running is killed first. The log file
        name is exported through the AUTOTEST_SCHEDULER_LOG_NAME
        environment variable and the full log path is remembered so that
        is_running() can watch it.

        @param do_recovery: when true, pass --recover-hosts to monitor_db.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Baseline for stall detection: the first size check compares
        # against 0 and the stall clock starts now.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Launch monitor_db with its stdout discarded; sets self.proc."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor once Popen
            # returns; closing ours avoids leaking one open file for every
            # restart of the scheduler.
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and still writing its log.

        @return: False if the child has exited, or if the log file size
            has not changed for more than STALL_TIMEOUT seconds (in which
            case diagnostic information is collected first); True
            otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        new_size = os.path.getsize(self.log_path)
        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write system diagnostics to '<log_path>.stall_info'.

        Runs a fixed set of commands and, when the BACKUP database
        credentials are configured, a mysqladmin process listing. Each
        command's output is written under its own banner. The file is
        overwritten on every call.
        """
        info_to_collect = ['uptime',
                           'ps auxwww',
                           'iostat -k -x 2 4',
                          ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the password becomes part of the command line,
            # which run_banner_output() also uses as the banner text, so
            # it ends up in the stall log - consider whether that is
            # acceptable.
            db_cmd %= (user, password)
            info_to_collect.append(db_cmd)
        except global_config.ConfigError:
            # Deliberate best effort: without BACKUP credentials the
            # database process list is simply skipped.
            pass
        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in info_to_collect:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if one of the commands raises.
            log.close()
def _fork_and_exit_parent(fork_number):
    """Fork once; the parent exits with status 0, only the child returns.

    If the fork fails, an error is written to stderr and the process exits
    with status 1.

    @param fork_number: ordinal of this fork (1 or 2), used only in the
        error message.
    """
    try:
        pid = os.fork()
    except OSError:
        # Fetch the exception through sys.exc_info() rather than binding
        # it in the except clause, so the statement is valid syntax on
        # every Python version.
        e = sys.exc_info()[1]
        sys.stderr.write("fork #%d failed: (%d) %s\n"
                         % (fork_number, e.errno, e.strerror))
        sys.exit(1)
    if pid > 0:
        sys.exit(0)  # exit from the parent


# The babysitter refuses to run as root.
if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

# Only one babysitter may run at a time.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

# NOTE(review): the PID file is written before the daemonizing forks
# below. If utils.write_pid records the calling process's PID, then in
# --background mode the file names the launcher that exits immediately,
# not the daemon - confirm against utils.write_pid.
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

if options.background:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    _fork_and_exit_parent(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    _fork_and_exit_parent(2)
else:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())


# Supervision loop: (re)start monitor_db whenever it dies or stalls.
while True:
    proc = MonitorProc(do_recovery=recover)
    proc.start()
    time.sleep(PAUSE_LENGTH)
    while proc.is_running():
        logging.info("Tick")
        time.sleep(PAUSE_LENGTH)
    # Recovery requested on the command line applies to the first launch
    # only; later restarts are started without --recover-hosts.
    recover = False