Moving test_setup.display_attributes to kvm_utils
[autotest-zwu.git] / scheduler / monitor_db_babysitter
blob aac6c4e8a490b8bca00666c1a9d29553df557113
1 #!/usr/bin/python -u
2 import os, sys, signal, time, subprocess, logging
3 from optparse import OptionParser
4 import common
5 from autotest_lib.scheduler import babysitter_logging_config
6 from autotest_lib.client.common_lib import error, global_config, utils
7 from autotest_lib.client.common_lib import logging_manager
8 from autotest_lib.scheduler import scheduler_logging_config
9 from autotest_lib.scheduler import monitor_db
# Seconds slept between two liveness checks of the monitored process.
PAUSE_LENGTH = 60
# Seconds the monitored log may keep the same size before the process is
# treated as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option("-r", action="store_true", dest="recover")
options, args = parser.parse_args()

# All paths are derived from the parent of the directory holding this script.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')
# optparse leaves options.recover as None unless -r was given.
recover = bool(options.recover)

# load logging settings
logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())

# Positional arguments are not accepted: show the usage text and give up.
if args:
    usage_lines = (
        "Usage: %s [options]" % __file__,
        " -r Run recovery mode. (Note: recovery is implicit after",
        " any crash!)",
        "",
    )
    sys.stdout.write("\n".join(usage_lines) + "\n")
    sys.exit(1)
def run_banner_output(cmd):
    """Run cmd and return its output underneath a banner naming the command.

    The returned string consists of cmd centered in a 60 column line padded
    with dashes, a newline, the command's stdout followed by its stderr,
    and two trailing newlines.

    @param cmd: command line string handed to utils.run().
    @return: the banner and output as one string. If utils.run() raises
            error.CmdError the output part is the text 'Timed out'.
    """
    banner = cmd.center(60, '-')
    try:
        # A non-zero exit status is not an error here; the run is limited
        # to 30 seconds.
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Substitute banner and output in a single formatting step. Building a
    # template from the banner first and formatting that a second time
    # breaks as soon as cmd itself contains a '%' character.
    return '%s\n%s\n\n' % (banner, command_output)
def kill_monitor():
    """Stop monitor_db: request a shutdown, then signal it again if needed.

    SIGINT is sent to the program registered under
    monitor_db.PID_FILE_PREFIX. If the program is still reported alive
    right afterwards, this waits 30 seconds and sends a second signal.
    """
    logging.info("Killing monitor_db")
    pid_prefix = monitor_db.PID_FILE_PREFIX

    # Try a clean shutdown first.
    utils.signal_program(pid_prefix, sig=signal.SIGINT)
    if not utils.program_is_alive(pid_prefix):
        # Already gone, nothing left to do.
        return

    # Give it some time to shut down, then kill it.
    time.sleep(30)
    # NOTE(review): this calls utils.signal_process while the first signal
    # above goes through utils.signal_program with the same argument;
    # confirm that utils really provides signal_process.
    utils.signal_process(pid_prefix)
def handle_sigterm(signum, frame):
    """Signal handler: stop monitor_db, drop our pid file, exit with 1.

    @param signum: number of the delivered signal (not used).
    @param frame: interrupted stack frame (not used).
    """
    logging.info('Caught SIGTERM')
    # Take the monitored process down before the babysitter itself goes.
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    raise SystemExit(1)
# Route SIGTERM through handle_sigterm so termination of the babysitter also
# stops monitor_db and removes the babysitter pid file.
signal.signal(signal.SIGTERM, handle_sigterm)

# Base class for MonitorProc, looked up in the optional site module.
# NOTE(review): presumably import_site_class falls back to its last argument
# (object) when the site module or class is absent; confirm in utils.
_SITE_BABYSITTER_MODULE = 'autotest_lib.scheduler.site_monitor_db_babysitter'
SiteMonitorProc = utils.import_site_class(
        __file__,
        _SITE_BABYSITTER_MODULE,
        'SiteMonitorProc',
        object)
class MonitorProc(SiteMonitorProc):
    """One monitor_db child process, supervised through its log file.

    Creating an instance kills any monitor_db that is already running and
    prepares the command line. start() launches the process, and
    is_running() reports whether it is still alive and still changing the
    size of its log file.
    """

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db and prepare a fresh command line.

        @param do_recovery: when true, "--recover-hosts" is placed on the
                monitor_db command line before the results directory.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()
        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Exported before the child is spawned so that it is part of the
        # environment the child inherits.
        # NOTE(review): presumably monitor_db reads this variable to log to
        # the same file that is watched here; confirm in monitor_db.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # State for stall detection: last seen log size and the time it
        # last changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Launch monitor_db with its stdout discarded."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor once Popen
            # returns; close ours so repeated restarts do not leak one
            # open file per monitor_db instance.
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and still writing its log.

        @return: False when the process has exited, or when the log file
                size has not changed for more than STALL_TIMEOUT seconds
                (diagnostics are collected first in that case); True
                otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # A missing or unreadable log must not take the babysitter
            # down. Count it as "unchanged" so the stall timer keeps
            # running.
            logging.warning("Unable to stat log file %s", self.log_path)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write the output of diagnostic commands to <log_path>.stall_info.

        The file is opened in "w" mode, so earlier contents are replaced.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                   ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the password becomes part of the command line,
            # and run_banner_output() copies the command line into the
            # banner that is written to the stall log below.
            commands.append(db_cmd % (user, password))
        except global_config.ConfigError:
            # Best effort: without these settings the database process
            # list is simply left out.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if a command or a write fails.
            log.close()
logging.info("initializing")

# Refuse to run with root privileges.
if os.getuid() == 0:
    logging.critical("running as root, aborting!")
    sys.exit(1)

# Only one babysitter may run at a time; register this one via the pid file.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("monitor_db_babysitter already running, aborting!")
    sys.exit(1)
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

# Supervise forever: start monitor_db, check on it every PAUSE_LENGTH seconds
# and start a new one as soon as it has died or stalled.
while True:
    proc = MonitorProc(do_recovery=recover)
    proc.start()
    while True:
        time.sleep(PAUSE_LENGTH)
        if not proc.is_running():
            break
        logging.info("Tick")
    # The recovery flag from the command line applies to the first launch
    # only.
    recover = False