# Commit: [runtime] Don't raise MonoError when unloading appdomain
# Source: mono-project.git / scripts / babysitter
# Blob: f8b5a6c345ca1545b6e47e06d44172ce7c6a69b9
1 #!/usr/bin/env python
3 # Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
5 import argparse
6 import subprocess
7 import re
8 import signal
9 import time
10 import sys
11 import os.path
12 import copy
13 import tempfile
14 import calendar
15 import json
### Constants

# Here is how the communication with nunit works. It has to work with two constraints:
# - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
#   and at some point down the line someone calls nunit.
# - nunit has to be able to report back to us even if (especially if) it terminates improperly.
# To deal with all this, communication babysitter->nunit is done by environment variables,
# and communication nunit->babysitter is done by leaving behind files in known locations.

# Filenames (created inside a fresh temp directory; the paths are passed to nunit via env)

CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt" # First line: testcase in progress when suite halted
RAN_TEST_FILE = "babysitter_report_ran_test_file.txt" # One testcase name per line
FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt" # One testcase name per line
LOGGING_FILE = "babysitter_report.json_lines" # NOTE(review): not referenced elsewhere in this script -- presumably a default for callers; confirm

# Environment keys

# Keys used for Babysitter<->Nunit IPC
CURRENT_TEST_KEY = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
RAN_TEST_KEY = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
FAILED_TEST_KEY = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
RUN_KEY = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
RUN_MODE_KEY = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN (whitelist) or EXCLUDE (blacklist); see run()

# Keys used for script configuration (see --help text)
LOG_FILE_KEY = 'MONO_BABYSITTER_LOG_FILE' # Path
RETRY_KEY = 'MONO_BABYSITTER_RETRY' # Equal to an integer
VERBOSE_KEY = 'MONO_BABYSITTER_VERBOSE' # "Undocumented"-- used for debugging babysitter

# JSON keys (used in the line-delimited JSON log written when LOG_FILE_KEY is set)

DATE_JSON = 'date' # POSIX timestamp of test suite run
INVOKE_JSON = 'invocation' # Full command line that was invoked
COUNT_JSON = 'iteration' # How many times was command executed?
LIMIT_JSON = 'failure_max' # Value of RETRY_KEY for this run
SUPPORT_JSON = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
FINAL_CODE_JSON = 'final_code' # Exit code, or "EXCEPTION" if the babysitter itself blew up
TESTS_JSON = 'tests' # Holds dictionary of (test case name)->(dict with TEST_ keys below)
TEST_FAILURES = 'normal_failures'
TEST_CRASH_FAILURES = 'crash_failures'
TEST_TIMEOUT_FAILURES = 'timeout_failures'
### Interpret arguments

scriptname = sys.argv[0]

# Awkward but necessary: everything after the second positional argument must be
# passed through to the command verbatim, and argparse cannot express that.
# Work around it with a parser subclass that can be told to swallow errors,
# letting the caller probe successively longer argv prefixes.
class Hesitate(Exception):
    """Raised in place of an argparse error while the parser is 'hesitating'."""

class HesitantParser(argparse.ArgumentParser):
    """ArgumentParser whose errors raise Hesitate while self.hesitating is True."""
    def __init__(self, *args, **kwargs):
        self.hesitating = True  # While True, parse errors raise Hesitate instead of exiting
        argparse.ArgumentParser.__init__(self, *args, **kwargs)

    def error(self, *args, **kwargs):
        if self.hesitating:
            # Bail out quietly before anything is printed; the caller will retry.
            raise Hesitate()
        argparse.ArgumentParser.error(self, *args, **kwargs)
# Define args.
# The description is rendered verbatim (RawTextHelpFormatter), so its line
# breaks and the interpolated env-var names appear as-is in --help output.
argparser = HesitantParser(description="""\
Run a test suite with a timeout.\n
Durations are floating point numbers followed by an optional unit:\n
's' for seconds (the default)
'm' for minutes
'h' for hours
'd' for days\n
supported environment variables:
%s: File to write logs to (as line-delimited JSON)
%s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
        (LOG_FILE_KEY, RETRY_KEY),
    formatter_class=argparse.RawTextHelpFormatter)
# Optional flags must precede the two positionals; anything after `command`
# is swept up separately by the prefix-parsing loop below.
argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
    help="Send this signal to the command on timeout, instead of TERM.")
argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
    help="If process continues running after signal, send KILL after this duration.")
argparser.add_argument('duration',
    help="Time to run before sending signal.")
argparser.add_argument('command', nargs="+", help="Command+args to run.")
# Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
# (This is what the Hesitate machinery above exists for: flags belonging to the wrapped
# command must not be interpreted as babysitter flags.)
argc = len(sys.argv)
extra_args = []
for limit in range(1,argc+1):
    try:
        if limit == argc: # On the final pass, parse for real
            argparser.hesitating = False
        args = argparser.parse_args(sys.argv[1:limit])
        # If we're still here, parse_args succeeded.
        # The final parsed arg is the command; remaining argv items are its args.
        extra_args = sys.argv[limit:]
        break
    except Hesitate: # Before the final pass, if parse_args fails, skip
        pass

argparser.hesitating = False # Just act like a normal argparser from here
# Durations are a number followed by an optional one-letter unit. The help text
# promises floating point numbers, so accept an optional fraction, and anchor
# the pattern so trailing garbage (e.g. "5x") is rejected instead of silently
# ignored (the old unanchored integer-only pattern parsed "5x" -- and "1.5m" --
# as plain 5).
durationre = re.compile(r'(\d+(?:\.\d+)?)([smhd]?)$')
def parse_duration(duration): # Accept units
    """Convert a duration string such as "90", "1.5m" or "2h" to seconds.

    Unit suffixes: 's' seconds (default), 'm' minutes, 'h' hours, 'd' days.
    Calls argparser.error() (which exits) on input it cannot understand.
    """
    match = durationre.match(duration)
    if not match:
        argparser.error("Could not understand duration %s" % duration)
    value, units = float(match.group(1)), match.group(2)
    # '' and 's' both mean seconds; everything else scales up.
    multiplier = {'': 1, 's': 1, 'm': 60, 'h': 60*60, 'd': 60*60*24}[units]
    return value * multiplier
def parse_signal(sig): # Accept names
    """Convert a signal number string ("9") or name ("KILL", "TERM") to its value.

    Calls argparser.error() (which exits) if the name is unknown.
    """
    if sig.isdigit():
        return int(sig)
    # Look the name up directly as signal.SIG<name>. The original scanned
    # signal.__dict__.iteritems(), which no longer exists on Python 3.
    value = getattr(signal, "SIG%s" % sig, None)
    if value is not None:
        return value
    argparser.error("Could not understand signal name %s" % sig)
# Final interpretation of arguments
duration = parse_duration(args.duration)
kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
timeout_signal = parse_signal(args.signal)
command = args.command + extra_args # Parsed command plus the verbatim passthrough args

# Process environment
global_env = copy.deepcopy( os.environ )

verbose = VERBOSE_KEY in global_env # Extra babysitter-debugging chatter
logging = LOG_FILE_KEY in global_env # Append one JSON line per invocation?
logfile = global_env[LOG_FILE_KEY] if logging else None
crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
if babysitting:
    # Tell nunit (via the env vars) where to leave its report files.
    babysitter_dir = tempfile.mkdtemp()
    global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
    global_env[RAN_TEST_KEY] = os.path.join(babysitter_dir, RAN_TEST_FILE)
    global_env[FAILED_TEST_KEY] = os.path.join(babysitter_dir, FAILED_TEST_FILE)

# Feature-detect how to signal whole process groups (used by popen/send_signal below)
have_unix_process_groups = 'killpg' in os.__dict__
have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
### Timeout implementation

def wait(proc, duration):
    """Poll proc until it exits, returning its exit code, or until duration
    seconds elapse, returning None.

    TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
    """
    deadline = time.time() + duration
    while True:
        exit_code = proc.poll()
        if exit_code is not None:
            return exit_code
        if time.time() > deadline:
            return None
        time.sleep(0.05)
# Popen and send_signal can't be called in their basic forms because we want to
# send signals to all children, not just to the immediately spawned process.
# Unfortunately the way to do this varies by operating system.
def popen(*args, **kwargs):
    """subprocess.Popen wrapper that places the child in its own process group."""
    if have_unix_process_groups:
        # UNIX: become session/process-group leader on spawn
        extra = {'preexec_fn': os.setsid}
    elif have_windows_process_groups:
        # Windows: request a fresh process group via creation flags
        extra = {'creationflags': subprocess.CREATE_NEW_PROCESS_GROUP}
    else:
        extra = {}
    kwargs.update(extra)
    return subprocess.Popen(*args, **kwargs)
def send_signal(proc, sig):
    """Deliver sig to proc and, where the platform allows, to its whole process
    group, mimicking GNU timeout's delivery order."""
    if have_unix_process_groups: # UNIX
        # For compatibility with GNU timeout, pre-send the signal to just the monitored process
        os.kill(proc.pid, sig)
        # Send signal to entire group
        os.killpg(proc.pid, sig)
        # For compatibility with GNU Timeout, send a SIGCONT after the signal
        # (so delivery has a chance to occur even for stopped processes)
        if sig != signal.SIGKILL and sig != signal.SIGCONT:
            os.kill(proc.pid, signal.SIGCONT)
    elif have_windows_process_groups: # Windows with Python 2.7 or better
        os.kill(proc.pid, sig) # Because of CREATE_NEW_PROCESS_GROUP, will go to entire group
    else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
        proc.send_signal(sig) # No way to contact group, just kill process
### Utility functions

def attemptDelete(path):
    """Best-effort removal of path; a missing or undeletable file is not an error."""
    try:
        os.remove(path)
    except OSError:
        # Already gone (or not removable) -- either way, nothing to do.
        pass
def attemptLines(path):
    """Return the lines of the file at path stripped of line endings, or []
    if the file cannot be read (e.g. nunit never wrote it).

    Always returns a real list: the original used map(), which on Python 3
    yields a one-shot, non-subscriptable iterator and would break callers
    like attemptFirstLine that index into the result.
    """
    try:
        with open(path) as f:
            return [line.strip('\r\n') for line in f]
    except (OSError, IOError):
        return []
def attemptFirstLine(path):
    """Return the first line of the file at path, or None if it is empty or unreadable."""
    for line in attemptLines(path):
        return line
    return None
def posixtime():
    """Current time as integer seconds since the UNIX epoch (UTC).

    Amazingly, time.time() does not guarantee an epoch in the docs;
    calendar.timegm over gmtime does.
    """
    now_utc = time.gmtime()
    return calendar.timegm(now_utc)
# Per-testcase failure tally, shared across iterations of the suite.
failcount = {}
def failure_may_retry(test):
    """Record one more failure of test; True while it remains under the failmax cap."""
    failcount[test] = failcount.get(test, 0) + 1
    return failcount[test] < failmax
def verbose_print(arg):
    """Print arg only when babysitter debugging (VERBOSE_KEY) is enabled."""
    if verbose:
        print(arg)
def failure_annotate(test):
    """Render test with its failure tally, e.g. 'Foo (failure #1 of 3 allowed)'."""
    seen = failcount[test]
    return "%s (failure #%d of %d allowed)" % (test, seen, failmax)
def pluralize(lst):
    """Return "s" when lst has more than one element, "" otherwise."""
    if len(lst) > 1:
        return "s"
    return ""
### Run command

def run(): # Returns exit code
    """Run the suite, rerunning per the retry/resume protocol; return the exit code.

    Retry state:
      resume_after   -- blacklist: testcases already run, excluded when resuming after a crash
      retry_these    -- whitelist: previously-reported-failed testcases to rerun
      ever_completed -- True once the suite has finished one run without crashing
      died_politely  -- True once the child process is known to be dead
    """
    resume_after = []
    retry_these = []
    ever_completed = False
    died_politely = False
    proc = None
    code = None

    # Set up logging
    log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
        INVOKE_JSON: " ".join(command)}

    def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
        if add is not None:
            if key not in target:
                target[key] = 0
            target[key] += add
        else:
            target[key] = set

    def log_test(testname, key, set=None, add=None): # Call to add test-case-level value to log
        if TESTS_JSON not in log:
            log[TESTS_JSON] = {}
        if testname not in log[TESTS_JSON]:
            log[TESTS_JSON][testname] = {}
        log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])

    # Ready to run tests
    try:
        while True:
            env = copy.copy(global_env)
            if ever_completed:
                retry_next = []
            else: # Persist reported failures list until first non-crash run
                retry_next = retry_these

            log_value(COUNT_JSON, add=1)

            # Prepare environment/filesystem: clear stale report files from the previous pass
            if babysitting:
                for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
                    attemptDelete(env[key])
            if resume_after:
                env[RUN_KEY] = ";".join(resume_after)
                env[RUN_MODE_KEY] = "EXCLUDE"
            elif retry_these:
                env[RUN_KEY] = ";".join(retry_these)
                env[RUN_MODE_KEY] = "RUN"

            # Run test suite
            try:
                proc = popen(command, env=env)
            except OSError:
                died_politely = True
                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
                sys.exit(127)

            code = wait(proc, duration)
            timed_out = code is None
            if timed_out: # Initial timeout
                send_signal(proc, timeout_signal)
                if kill_after is not None: # Kill-after timeout
                    code = wait(proc, kill_after)
                    if code is None:
                        send_signal(proc, signal.SIGKILL)
                code = proc.wait() # Waits forever
                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
            died_politely = True

            # The test suite has now run, and what happens next varies:
            # 1. The suite either completed fully without failures, or timed out: Just quit.
            # 2. The suite crashed (halted without completing):
            #    Remember any failures for later and rerun, using a blacklist of testcases we have completed.
            # 3. The suite completed, but there were failures reported:
            #    Rerun, using a whitelist of only reported-failed testcases.
            # 4. The suite crashed partway through a run with a whitelist:
            #    Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.

            crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
            failed_tests = attemptLines(env[FAILED_TEST_KEY])
            ran_tests = attemptLines(env[RAN_TEST_KEY])
            bailout = False

            if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
                log_value(SUPPORT_JSON, True)

            if not crashed_at and not ever_completed: # The resume_after whitelist is only
                resume_after = []                     # used before the first noncrashing run
                ever_completed = True

            if timed_out: # Currently no retries after timeout
                bailout = True
                code = 124 # See GNU timeout manpage

            if code or crashed_at: # Process failures
                # Handle crash failures
                if crashed_at and not timed_out:
                    log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
                    if not crash_resuming:
                        bailout = True

                    if failure_may_retry(crashed_at):
                        if ever_completed: # Rerun with whitelist next time
                            for test in retry_these: # Prepopulate with last whitelist minus run testcases
                                if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
                                    retry_next.append(test)
                        else: # Rerun with blacklist next time
                            for test in ran_tests: # Add testcases we just ran to blacklist
                                if test != crashed_at: # (except for the crashed testcase)
                                    resume_after.append(test)
                    else:
                        bailout = True

                # Handle reported failures
                for test in failed_tests:
                    log_test(test, TEST_FAILURES, add=1)
                    if failure_may_retry(test):
                        retry_next.append(test)
                    else:
                        bailout = True

                # Report human-readable failures for stdout log.
                message = "%s:" % (scriptname)
                if timed_out:
                    message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
                elif crashed_at:
                    message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
                if failed_tests:
                    message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
                if not (timed_out or crashed_at or failed_tests):
                    message += " Test suite terminated with code %d, " % (code)
                    if log[SUPPORT_JSON]:
                        message += "but failure did not occur during a test case. Halting."
                    else:
                        message += "and suite cannot report test case data. Halting."
                elif bailout:
                    message += " Will halt testing."
                print(message)

            if bailout or not (resume_after or retry_next): # If not retrying
                return code

            # If we got here, a retry is occurring
            retry_these = retry_next

            # Report human-readable retry notice for stdout log.
            message = "%s: Will rerun test suite" % (scriptname)
            if log[COUNT_JSON] > 1:
                message += " (iteration #%d)" % (log[COUNT_JSON])
            if resume_after:
                message += ", resuming at crashed testcase %s." % (crashed_at)
            else:
                message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
            print(message)
    finally:
        # Emergency: Ensure command does not outlive this script
        if proc is not None and not died_politely:
            send_signal(proc, signal.SIGKILL)

        # Write out logs (in finally: so this happens even on early return or exception)
        log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
        if logging:
            with open(logfile, "a") as f:
                f.write(json.dumps(log) + os.linesep)

sys.exit( run() )