2 import os
, sys
, signal
, time
, subprocess
, logging
3 from optparse
import OptionParser
5 from autotest_lib
.scheduler
import babysitter_logging_config
6 from autotest_lib
.client
.common_lib
import error
, global_config
, utils
7 from autotest_lib
.client
.common_lib
import logging_manager
8 from autotest_lib
.scheduler
import scheduler_logging_config
9 from autotest_lib
.scheduler
import monitor_db
# Seconds the scheduler log may go without changing size before
# MonitorProc.is_running() reports monitor_db as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

# Command line: the only switch defined here is -r, stored as options.recover.
parser = OptionParser()
parser.add_option("-r", dest="recover", action="store_true")
options, args = parser.parse_args()

# Paths are resolved relative to the parent of the directory holding this
# script.
autodir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')

# optparse leaves options.recover as None when -r is not given; comparing
# against True turns that into a plain False.
recover = (options.recover == True)

# load logging settings
_babysitter_logging = babysitter_logging_config.BabysitterLoggingConfig()
logging_manager.configure_logging(_babysitter_logging)
28 print "Usage: %s [options]" % __file__
29 print " -r Run recovery mode. (Note: recovery is implicit after"
def run_banner_output(cmd):
    """Returns ------ CMD ------\nCMD_OUTPUT in a string

    The command text is centred in a 60-column line of dashes, followed by a
    newline, the command's stdout and stderr concatenated, and two trailing
    newlines.

    @param cmd: command string passed to utils.run() with ignore_status=True
            and timeout=30.

    @return: the formatted string. When utils.run() raises error.CmdError
            the output portion is the literal text 'Timed out'.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format banner and output in a single step. Building the banner first
    # and applying '%' to it a second time would treat any '%' contained in
    # cmd (e.g. "date +%s") as a conversion specifier and fail or garble the
    # result.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
49 logging
.info("Killing monitor_db")
51 utils
.signal_program(monitor_db
.PID_FILE_PREFIX
, sig
=signal
.SIGINT
)
52 if utils
.program_is_alive(monitor_db
.PID_FILE_PREFIX
): # was it killed?
53 # give it some time to shutdown
56 utils
.signal_process(monitor_db
.PID_FILE_PREFIX
)
def handle_sigterm(signum, frame):
    """Signal handler: log the SIGTERM and remove the babysitter PID file.

    @param signum: signal number supplied by the signal module (unused).
    @param frame: interrupted stack frame supplied by the signal module
            (unused).
    """
    logging.info('Caught SIGTERM')
    pid_file_prefix = monitor_db.BABYSITTER_PID_FILE_PREFIX
    utils.delete_pid_file_if_exists(pid_file_prefix)
# Have handle_sigterm() run whenever this process receives SIGTERM.
signal.signal(signal.SIGTERM, handle_sigterm)


# Base class for MonitorProc, obtained through utils.import_site_class().
# `object` is passed as the last argument -- presumably the fallback base
# used when no site-specific module is installed; confirm against utils.
_SITE_MODULE_NAME = 'autotest_lib.scheduler.site_monitor_db_babysitter'
SiteMonitorProc = utils.import_site_class(
    __file__, _SITE_MODULE_NAME, 'SiteMonitorProc', object)
class MonitorProc(SiteMonitorProc):
    """Launches monitor_db as a child process and checks that it stays healthy.

    is_running() treats the scheduler as unhealthy when the child process has
    exited, or when the scheduler log file has not changed size for more than
    STALL_TIMEOUT seconds.

    NOTE(review): this copy of the file has gaps inside the class body (for
    example the tail of the command list in collect_stalled_info()). Compare
    against the upstream file before relying on the exact method layout.
    """

    def __init__(self, do_recovery=False):
        """Build the monitor_db command line and start the process.

        @param do_recovery: when true, "--recover-hosts" is added to the
                monitor_db command line.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Baseline for the stall detection in is_running(): the first size
        # comparison is made against 0, and the stall clock starts now.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s" % self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()

        # The child receives its own copy of the descriptor, so this
        # process's handle can be closed as soon as Popen() returns.
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            devnull.close()


    def is_running(self):
        """Report whether monitor_db is alive and its log is still changing.

        @return: False when the child process has exited, or when the log
                size has been unchanged for more than STALL_TIMEOUT seconds
                (stall diagnostics are collected before returning);
                True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        new_size = os.path.getsize(self.log_path)
        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write the output of diagnostic commands to '<log_path>.stall_info'.

        Each command's output is formatted by run_banner_output(). The
        mysqladmin process list is included only when the BACKUP user and
        password can be read from the global config.
        """
        # NOTE(review): only 'uptime' is visible in this copy of the file;
        # the original list may have contained further commands -- confirm
        # against upstream.
        commands = ['uptime']

        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            commands.append(db_cmd % (user, password))
        except global_config.ConfigError:
            # Best effort: without BACKUP credentials the database process
            # list is simply left out of the report.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close (and thereby flush) the report even if a command fails.
            log.close()
145 logging
.info("initializing")
148 logging
.critical("running as root, aborting!")
151 if utils
.program_is_alive(monitor_db
.BABYSITTER_PID_FILE_PREFIX
):
152 logging
.critical("monitor_db_babysitter already running, aborting!")
154 utils
.write_pid(monitor_db
.BABYSITTER_PID_FILE_PREFIX
)
157 proc
= MonitorProc(do_recovery
=recover
)
159 time
.sleep(PAUSE_LENGTH
)
160 while proc
.is_running():
162 time
.sleep(PAUSE_LENGTH
)