2 import os
, sys
, signal
, time
, subprocess
, logging
3 from optparse
import OptionParser
5 from autotest_lib
.scheduler
import babysitter_logging_config
6 from autotest_lib
.client
.common_lib
import error
, global_config
, utils
7 from autotest_lib
.client
.common_lib
import logging_manager
8 from autotest_lib
.scheduler
import scheduler_logging_config
9 from autotest_lib
.scheduler
import monitor_db
# Seconds the scheduler log may stay the same size before monitor_db is
# treated as stalled (two hours).
STALL_TIMEOUT = 2*60*60

# Seconds slept between liveness checks of monitor_db.
# NOTE(review): PAUSE_LENGTH is read by the polling loop at the bottom of this
# file but no definition of it is visible; 60 is an assumed value - confirm.
PAUSE_LENGTH = 60

parser = OptionParser()
parser.add_option("-r", action="store_true", dest="recover",
                  help=("run recovery mode (implicit after any crash)"))
# NOTE(review): the end of this help string is cut off in the source as seen;
# "background" is the assumed remainder - confirm.
parser.add_option("--background", dest="background", action="store_true",
                  default=False, help=("runs the scheduler monitor on "
                                       "background"))
(options, args) = parser.parse_args()

# Paths are derived from this file's location: autodir is its parent directory.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')

# options.recover is None unless -r was given; normalize it to a real bool.
recover = bool(options.recover)
def run_banner_output(cmd):
    """Run a command and return its output underneath a banner line.

    The banner is the command text centered in a 60 character wide line
    padded with '-'. The command's stdout followed by its stderr goes on the
    next line(s), and the result ends with a blank line.

    @param cmd: command line handed to utils.run().
    @return: banner plus output as one string. The output part is the text
            'Timed out' when utils.run() raises error.CmdError.
    """
    banner = cmd.center(60, '-')
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format in a single step: a two-stage '%' substitution would misparse
    # any '%' character contained in the command text itself.
    return '%s\n%s\n\n' % (banner, command_output)
def kill_monitor():
    """Stop the monitor_db process recorded under monitor_db.PID_FILE_PREFIX.

    Sends SIGINT first so monitor_db can shut down by itself. If it is still
    alive afterwards it gets a grace period and is then signalled again.
    """
    logging.info("Killing monitor_db")
    # try a clean shutdown first
    utils.signal_program(monitor_db.PID_FILE_PREFIX, sig=signal.SIGINT)
    if utils.program_is_alive(monitor_db.PID_FILE_PREFIX): # was it killed?
        # give it some time to shutdown
        # NOTE(review): the wait itself is not present in the source as seen;
        # 30 seconds is an assumed grace period - confirm.
        time.sleep(30)
        # NOTE(review): the first signal goes through utils.signal_program()
        # while this one uses utils.signal_process() with the same pid file
        # prefix - verify that utils really provides signal_process().
        utils.signal_process(monitor_db.PID_FILE_PREFIX)


def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, drop our pid file and exit.

    @param signum: signal number (unused).
    @param frame: interrupted stack frame (unused).
    """
    logging.info('Caught SIGTERM')
    # Do not leave the scheduler running without a babysitter.
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    # Without an explicit exit the handler would return and the interrupted
    # code would simply carry on, i.e. SIGTERM would be swallowed.
    sys.exit(1)


signal.signal(signal.SIGTERM, handle_sigterm)
# Base class for MonitorProc, looked up through utils.import_site_class().
# NOTE(review): presumably `object` is what is returned when no site-specific
# module is installed - confirm against utils.import_site_class().
SiteMonitorProc = utils.import_site_class(
        __file__,
        'autotest_lib.scheduler.site_monitor_db_babysitter',
        'SiteMonitorProc',
        object)
class MonitorProc(SiteMonitorProc):
    """A launched monitor_db process plus the state used to watch it.

    Liveness is judged in two ways: whether the child process has exited, and
    whether the scheduler log file has changed size within STALL_TIMEOUT
    seconds.
    """

    def __init__(self, do_recovery=False):
        """Build the monitor_db command line and launch the process.

        @param do_recovery: when true, "--recover-hosts" is passed to
                monitor_db.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Exported through the environment, which the child process inherits.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Baseline for the stall detection done in is_running().
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s" % self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()

        # The child gets its own copy of the descriptor, so ours can be
        # closed as soon as Popen() returns instead of leaking one per launch.
        with open(os.devnull, 'w') as devnull:
            self.proc = subprocess.Popen(self.args, stdout=devnull)


    def is_running(self):
        """Tell whether monitor_db is alive and still writing to its log.

        @return: False if the process has exited, or if the log file size has
                not changed for more than STALL_TIMEOUT seconds (stall
                information is collected first in that case); True otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        new_size = os.path.getsize(self.log_path)
        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Write diagnostic command output to '<log_path>.stall_info'.

        Each command is run through run_banner_output(). The mysqladmin
        process list is added only when the BACKUP user and password can be
        read from the global config.
        """
        # NOTE(review): the list literal is cut off after 'uptime' in the
        # source as seen; any further entries could not be recovered.
        INFO_TO_COLLECT = ['uptime']
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            db_cmd %= (user, password)
            INFO_TO_COLLECT.append(db_cmd)
        except global_config.ConfigError:
            # Best effort: without credentials the database query is skipped.
            pass

        stall_log_path = self.log_path + '.stall_info'
        # The context manager closes the file even if a command fails.
        with open(stall_log_path, "w") as log:
            for cmd in INFO_TO_COLLECT:
                log.write(run_banner_output(cmd))
# Refuse to run with root privileges.
if os.getuid() == 0:
    logging.critical("Running as root, aborting!")
    sys.exit(1)

# Only one babysitter may run at a time.
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
    logging.critical("Monitor_db_babysitter already running, aborting!")
    sys.exit(1)

if options.background:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig(use_console=False))

    # Double fork - see http://code.activestate.com/recipes/66012/
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0) # exit from first parent
    except OSError as e:
        sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)

    # Decouple from parent environment
    os.chdir("/")
    os.umask(0)
    os.setsid()

    # Second fork
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0) # exit from second parent
    except OSError as e:
        sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
        sys.exit(1)
else:
    logging_manager.configure_logging(
        babysitter_logging_config.BabysitterLoggingConfig())

# Record the pid only now: in background mode the process that reaches this
# point is the final daemon, while the earlier parents have already exited.
# NOTE(review): assumes utils.write_pid() records the calling process - confirm.
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)

# Supervise forever: launch monitor_db, poll it every PAUSE_LENGTH seconds and
# launch a fresh one once is_running() reports that it died or stalled.
while True:
    proc = MonitorProc(do_recovery=recover)
    time.sleep(PAUSE_LENGTH)
    while proc.is_running():
        time.sleep(PAUSE_LENGTH)