virt.virt_test_utils: run_autotest - 'tar' needs relative paths to strip the leading '/'
[autotest-zwu.git] / server / crashcollect.py
blobdc582ac82c1c2b12d46a08e77ab41790edc2b8bf
1 import os, time, pickle, logging, shutil
3 from autotest_lib.client.common_lib import global_config
4 from autotest_lib.server import utils
# Look up the optional site-specific collection hooks in
# autotest_lib.server.site_crashcollect. The last argument to each lookup is
# a do-nothing callable with the same (host, test_start_time) signature
# (presumably what import_site_function hands back when the site module does
# not provide the hook -- confirm against utils.import_site_function).
get_site_crashdumps = utils.import_site_function(
        __file__,
        "autotest_lib.server.site_crashcollect",
        "get_site_crashdumps",
        lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
        __file__,
        "autotest_lib.server.site_crashcollect",
        "get_site_crashinfo",
        lambda host, test_start_time: None)
def get_crashdumps(host, test_start_time):
    """Collect crashdumps for a host by delegating to the site hook.

    @param host: Passed through unchanged to get_site_crashdumps
    @param test_start_time: Passed through unchanged to get_site_crashdumps
    """
    get_site_crashdumps(host, test_start_time)
def get_crashinfo(host, test_start_time):
    """Collect crashdumps and general crash information from a host.

    Crashdumps are gathered first. The remaining steps (site-specific
    crashinfo, /var/log/messages, the ssh reboot monitor log, dmesg output
    and stranded client logs) only run once the host is reachable.

    NOTE(review): indentation was lost in the copy of the file this was
    rebuilt from; every step after the recovery check is assumed to be
    conditional on the host coming back up.

    @param host: The RemoteHost to collect from
    @param test_start_time: Passed through to the crashdump/crashinfo hooks
    """
    logging.info("Collecting crash information...")

    # crashdumps are treated as part of the general crashinfo
    get_crashdumps(host, test_start_time)

    # nothing more can be fetched from a host that never came back
    if not wait_for_machine_to_recover(host):
        return

    # site-specific collection goes first
    get_site_crashinfo(host, test_start_time)

    info_dir = get_crashinfo_dir(host)
    dmesg_path = os.path.join(info_dir, "dmesg")
    collect_messages(host)
    collect_log_file(host, "/var/log/monitor-ssh-reboots", info_dir)
    collect_command(host, "dmesg", dmesg_path)
    collect_uncollected_logs(host)
# Default number of hours to wait for a machine to come back before giving up
# on crash collection. Read as a float from the 'SERVER' section of the
# global config, with 4.0 passed as the default value.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
        'SERVER',
        'crash_collection_hours_to_wait',
        type=float,
        default=4.0)
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Block until a (possibly down) machine is accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine is already up or comes back up within
            the allotted time, False otherwise
    """
    # taken before the host is probed; it is only reported in the
    # "Waiting ..." log message below
    started_at = time.strftime("%b %d %H:%M:%S", time.localtime())

    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, started_at)

    # wait_up takes its timeout in seconds
    timeout_seconds = hours_to_wait * 3600
    if host.wait_up(timeout=timeout_seconds):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
def get_crashinfo_dir(host):
    """Find and if necessary create a directory to store crashinfo in.

    The directory is named "crashinfo.<hostname>" and lives under the
    host's job resultdir when one is set, otherwise under the current
    working directory.

    @param host: The RemoteHost object that crashinfo will be collected from

    @returns: The path to an existing directory for writing crashinfo into
    @raises OSError: If the directory does not exist and cannot be created
    """
    # host.job and job.resultdir may each be missing or empty
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        parent = host_resultdir
    else:
        parent = os.path.abspath(os.getcwd())
    infodir = os.path.join(parent, "crashinfo.%s" % host.hostname)

    # Create first and tolerate "already there" instead of checking and then
    # creating: the check-then-create form fails if something else creates
    # the directory between the two calls.
    try:
        os.mkdir(infodir)
    except OSError:
        if not os.path.exists(infodir):
            raise
    return infodir
def collect_log_file(host, log_path, dest_path):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Collection is best-effort: a failure is logged as a warning, together
    with the error that caused it, and is not raised to the caller.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    """
    logging.info("Collecting %s...", log_path)
    try:
        host.get_file(log_path, dest_path, preserve_perm=False)
    except Exception as e:
        # report why the copy failed, as collect_command does
        logging.warning("Collection of %s failed: %s", log_path, e)
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    Collection is best-effort: any failure while running the command or
    writing its output is logged as a warning and not raised.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
            the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    # stdout_tee is pointed at /dev/null; the output is taken from the
    # result object instead. The with-block closes the handle on every path.
    with open("/dev/null", "w") as devnull:
        try:
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Does nothing when the host has no job. Otherwise every entry returned by
    host.job.get_client_logs() whose hostname matches this host is copied
    from its remote path to its local path. Errors are logged as warnings
    and not raised.

    @param host: The RemoteHost to collect from
    """
    # getattr keeps a host without a "job" attribute from raising, matching
    # how get_crashinfo_dir reads the same attribute
    job = getattr(host, "job", None)
    if not job:
        return

    try:
        for hostname, remote_path, local_path in job.get_client_logs():
            if hostname != host.hostname:
                continue
            logging.info("Retrieving logs from %s:%s into %s",
                         hostname, remote_path, local_path)
            host.get_file(remote_path + "/", local_path + "/")
    except Exception as e:
        logging.warning("Error while trying to collect stranded "
                        "Autotest client logs: %s", e)
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    The result is written to "messages" inside the host's crashinfo
    directory. Errors raised while fetching or trimming the files are logged
    as warnings and not raised.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host)

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        size_at_start = 0
        if os.path.exists(messages_at_start):
            # the first line of the messages at start should match the
            # first line of the current messages; if it doesn't then
            # messages has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start == first_line_now:
                size_at_start = os.path.getsize(messages_at_start)

        # Binary mode: size_at_start is a byte count from getsize, so the
        # seek must be in bytes. The with-blocks close both files even if
        # the seek or the copy fails.
        with open(messages_raw, "rb") as raw_messages_file:
            with open(messages, "wb") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)