Transpose lines.
[sbcl.git] / tests / deadlock.impure.lisp
blob0c0b5d8870e9921560853ac30972a11b122aebfc
1 #-sb-thread (invoke-restart 'run-tests::skip-file)
2 #+ultrafutex (invoke-restart 'run-tests::skip-file)
4 (import '(sb-thread:join-thread
5 sb-thread:make-mutex
6 sb-thread:make-semaphore
7 sb-thread:make-thread
8 sb-thread:signal-semaphore
9 sb-thread:thread-deadlock
10 sb-thread:wait-on-semaphore
11 sb-thread:with-mutex))
13 (when (sb-sys:find-dynamic-foreign-symbol-address "show_gc_generation_throughput")
14 (setf (extern-alien "show_gc_generation_throughput" int) 0))
16 (with-test (:name :deadlock-detection.1)
17 (loop
18 repeat 1000
19 do (flet ((test (ma mb sa sb)
20 (lambda ()
21 (handler-case
22 (with-mutex (ma)
23 (signal-semaphore sa)
24 (wait-on-semaphore sb)
25 (with-mutex (mb)
26 :ok))
27 (thread-deadlock (e)
28 ;; (assert (plusp (length ...))) prevents
29 ;; flushing.
30 (assert (plusp (length (princ-to-string e))))
31 :deadlock)))))
32 (let* ((m1 (make-mutex :name "M1"))
33 (m2 (make-mutex :name "M2"))
34 (s1 (make-semaphore :name "S1"))
35 (s2 (make-semaphore :name "S2"))
36 (t1 (make-thread (test m1 m2 s1 s2) :name "T1"))
37 (t2 (make-thread (test m2 m1 s2 s1) :name "T2")))
38 ;; One will deadlock, and the other will then complete normally.
39 (let ((res (list (join-thread t1)
40 (join-thread t2))))
41 (assert (or (equal '(:deadlock :ok) res)
42 (equal '(:ok :deadlock) res))))))))
44 (with-test (:name :deadlock-detection.2)
45 (let* ((m1 (make-mutex :name "M1"))
46 (m2 (make-mutex :name "M2"))
47 (s1 (make-semaphore :name "S1"))
48 (s2 (make-semaphore :name "S2"))
49 (t1 (make-thread
50 (lambda ()
51 (with-mutex (m1)
52 (signal-semaphore s1)
53 (wait-on-semaphore s2)
54 (with-mutex (m2)
55 :ok)))
56 :name "T1")))
57 (prog (err)
58 :retry
59 (handler-bind ((thread-deadlock
60 (lambda (e)
61 (unless err
62 ;; Make sure we can print the condition
63 ;; while it's active
64 (let ((*print-circle* nil))
65 (setf err (princ-to-string e)))
66 (go :retry)))))
67 (when err
68 (sleep 1))
69 (assert (eq :ok (with-mutex (m2)
70 (unless err
71 (signal-semaphore s2)
72 (wait-on-semaphore s1)
73 (sleep 1))
74 (with-mutex (m1)
75 :ok)))))
76 (assert (stringp err)))
77 (assert (eq :ok (join-thread t1)))))
79 (with-test (:name :deadlock-detection.3
80 :broken-on (and :darwin :gc-stress))
81 (let* ((m1 (make-mutex :name "M1"))
82 (m2 (make-mutex :name "M2"))
83 (s1 (make-semaphore :name "S1"))
84 (s2 (make-semaphore :name "S2"))
85 (t1 (make-thread
86 (lambda ()
87 (with-mutex (m1)
88 (signal-semaphore s1)
89 (wait-on-semaphore s2)
90 (with-mutex (m2)
91 :ok)))
92 :name "T1")))
93 ;; Currently we don't consider it a deadlock
94 ;; if there is a timeout in the chain.
95 (assert (eq :deadline
96 (handler-case
97 (with-mutex (m2)
98 (signal-semaphore s2)
99 (wait-on-semaphore s1)
100 (sleep 1)
101 (sb-sys:with-deadline (:seconds 0.1)
102 (with-mutex (m1)
103 :ok)))
104 (sb-sys:deadline-timeout ()
105 :deadline)
106 (thread-deadlock ()
107 :deadlock))))
108 (assert (eq :ok (join-thread t1)))))
110 (with-test (:name (:deadlock-detection :interrupts)
111 :broken-on :win32)
112 (let* ((m1 (sb-thread:make-mutex :name "M1"))
113 (m2 (sb-thread:make-mutex :name "M2"))
114 (t1-can-go (sb-thread:make-semaphore :name "T1 can go"))
115 (t2-can-go (sb-thread:make-semaphore :name "T2 can go"))
116 (t1 (sb-thread:make-thread
117 (lambda ()
118 (sb-thread:with-mutex (m1)
119 (sb-thread:wait-on-semaphore t1-can-go)
120 :ok1))
121 :name "T1"))
122 (t2 (sb-thread:make-thread
123 (lambda ()
124 (sb-ext:wait-for (eq t1 (sb-thread:mutex-owner m1)))
125 (sb-thread:with-mutex (m1 :wait-p t)
126 (sb-thread:wait-on-semaphore t2-can-go)
127 :ok2))
128 :name "T2")))
129 (sb-ext:wait-for (eq m1 (sb-thread::thread-waiting-for t2)))
130 (sb-thread:interrupt-thread t2 (lambda ()
131 (sb-thread:with-mutex (m2 :wait-p t)
132 (sb-ext:wait-for
133 (eq m2 (sb-thread::thread-waiting-for t1)))
134 (sb-thread:signal-semaphore t2-can-go))))
135 (sb-ext:wait-for (eq t2 (sb-thread:mutex-owner m2)))
136 (sb-thread:interrupt-thread t1 (lambda ()
137 (sb-thread:with-mutex (m2 :wait-p t)
138 (sb-thread:signal-semaphore t1-can-go))))
139 ;; both threads should finish without a deadlock or deadlock
140 ;; detection error
141 (let ((res (list (sb-thread:join-thread t1)
142 (sb-thread:join-thread t2))))
143 (assert (equal '(:ok1 :ok2) res)))))
145 (with-test (:name (:deadlock-detection :gc))
146 ;; To semi-reliably trigger the error (in SBCL's where)
147 ;; it was present you had to run this for > 30 seconds,
148 ;; but that's a bit long for a single test.
149 (let* ((stop (+ 5 (get-universal-time)))
150 (m1 (sb-thread:make-mutex :name "m1"))
151 (t1 (sb-thread:make-thread
152 (lambda ()
153 (loop until (> (get-universal-time) stop)
154 do (sb-thread:with-mutex (m1)
155 (eval `(make-array 24))))
156 :ok)))
157 (t2 (sb-thread:make-thread
158 (lambda ()
159 (loop until (> (get-universal-time) stop)
160 do (sb-thread:with-mutex (m1)
161 (eval `(make-array 24))))
162 :ok))))
163 (let ((res (list (sb-thread:join-thread t1)
164 (sb-thread:join-thread t2))))
165 (assert (equal '(:ok :ok) res)))))
167 #+deadlock-test-timing
168 (progn
169 (defun clock-gettime () (sb-unix:clock-gettime sb-unix:clock-realtime))
170 (defun seconds-since (start_sec start_nsec)
171 (multiple-value-bind (stop_sec stop_nsec) (clock-gettime)
172 (+ (/ (coerce (- stop_nsec start_nsec) 'double-float) 1000000000)
173 (- stop_sec start_sec)))))
175 (defglobal *max-avl-tree-total* 0)
176 (defglobal *max-avl-tree-born* 0)
177 (defglobal *max-avl-tree-running* 0)
178 (defglobal *max-avl-tree-died* 0)
180 (defun avl-maptree (fun tree)
181 (sb-int:named-let recurse ((node tree))
182 (when node
183 (funcall fun node)
184 (recurse (sb-thread::avlnode-left node))
185 (recurse (sb-thread::avlnode-right node)))))
187 (defun thread-count (&optional (tree sb-thread::*all-threads*))
188 (let ((born 0)
189 (running 0)
190 (died 0))
191 (sb-int:dx-flet ((mapfun (node)
192 (ecase (sb-thread::thread-%visible (sb-thread::avlnode-data node))
193 (0 (incf born)) ; "can't happen" ?
194 (1 (incf running))
195 (-1 (incf died)))))
196 (avl-maptree #'mapfun tree))
197 (let ((total (+ born running died)))
198 (macrolet ((max-into (global mine)
199 `(let ((old ,global))
200 (loop
201 (when (<= ,mine old) (return))
202 (let ((actual (cas ,global old ,mine)))
203 (if (>= actual old) (return)) (setq old actual))))))
204 (max-into *max-avl-tree-total* total)
205 (max-into *max-avl-tree-born* born)
206 (max-into *max-avl-tree-running* running)
207 (max-into *max-avl-tree-died* died))
208 (list total born running died))))
210 ;;; I would optimistically guess that this test can no longer fail,
211 ;;; even for :win32, since there is no lisp mutex around *all-threads*.
212 ;;; (There is still the C one)
214 ;;; With the sesion lock acquisition as part of thread startup,
215 ;;; but with a patch that allows the thread creator to run while
216 ;;; the child is waiting, all this test managed to do was create thousands
217 ;;; of threads all blocked on the *SESSION-LOCK*, because MAKE-THREAD could
218 ;;; return the created thread did anything at all.
219 ;;; The way that happens is as follows:
220 ;;; thread 0: grab session lock, initiate GC
221 ;;; ... return from GC but not release session lock yet
222 ;;; thread 1: start sleeping
223 ;;; thread 2: try to grab session lock, enter a wait
224 ;;; thread 3: start sleeping
225 ;;; thread 4: try to grab session lock, enter a wait
226 ;;; thread 5: start sleeping
227 ;;; ...
228 ;;; So the number of threads running simultaneously depends entirely on when
229 ;;; the even numbered threads get CPU time to release the session lock.
231 ;;; That said, I was curious why this test complete _so_ much slowly
232 ;;; with faster thread start, and more quickly with slower thread start.
233 ;;; Enabling the :deadlock-test-timing feature shows:
235 ;;; Maxima attained: 2923 1 2923 2916 current=(2923 0 1 2922)
236 ;;; ---- ----
237 ;;; tree node count ^ ^ nunber of threads in "run" state
239 ;;; The answer is obvious: the time spent in each GC is proportional to
240 ;;; the number of threads, and with faster thread start, we can actually
241 ;;; achieve *nearly* 3000 threads running at the same time.
242 ;;; With each thread allocating slightly over 4MiB of memory, that's
243 ;;; about 12GiB of additional memory for the OS to manage which has
244 ;;; a 2nd-order effect on our runtime as well.
245 ;;; Contrast this with the "old way" where we were essentially
246 ;;; firing off two threads at a time and letting them finish
247 ;;; before getting to the next two.
249 ;; (pushnew :deadlock-test-timing *features*)
251 (defglobal *message-in-counter* 0)
252 (defglobal *message-out-counter* 0)
253 (declaim (fixnum *message-in-counter* *message-out-counter*))
254 (defparameter *huge-n-threads* 3000)
255 (defglobal *messages* (make-array (* 2 *huge-n-threads*)))
256 (declaim (simple-vector *messages*))
258 (defun show-queued-messages ()
259 (loop while (< *message-out-counter* *message-in-counter*)
260 do (let ((args (aref *messages* *message-out-counter*)))
261 ;; If a store did not get into the array yet, bail out
262 ;; and hope it shows up by the next time we're here.
263 (when (eql args 0)
264 (return-from show-queued-messages))
265 (apply #'format t (concatenate 'string "~4d " (car args)) (cdr args))
266 (terpri))
267 (incf *message-out-counter*)))
269 ;;; Atomically log a message for output by the main thread (to avoid interleaving)
270 (defun message (control &rest args)
271 (let* ((data (cons control args))
272 (index (atomic-incf *message-in-counter*)))
273 (setf (aref *messages* index) data)))
275 #+darwin
276 (test-util::disable-profiling)
278 ;;; This encounters the "backing off for retry" error if attempting
279 ;;; to start too many threads.
280 (defparameter *max-runnable-threads* #+x86-64 100 #-x86-64 5)
281 (with-test (:name :gc-deadlock
282 :broken-on :win32)
283 #+nil (write-line "WARNING: THIS TEST WILL HANG ON FAILURE!")
284 ;; Prior to 0.9.16.46 thread exit potentially deadlocked the
285 ;; GC due to *all-threads-lock* and session lock. On earlier
286 ;; versions and at least on one specific box this test is good enough
287 ;; to catch that typically well before the 1500th iteration.
288 (loop
289 with i = 0
290 with n = *huge-n-threads*
291 with running = nil
292 while (< i n)
294 (show-queued-messages)
295 (incf i)
296 #-deadlock-test-timing
297 (when (zerop (mod i 100))
298 (write-char #\.)
299 (force-output))
300 (when (> (length running) *max-runnable-threads*)
301 (let ((last (car (last running))))
302 (sb-thread:join-thread last)
303 (setq running (nbutlast running))))
304 (handler-case
305 (push
306 (if (oddp i)
307 (make-join-thread
308 (lambda (i &aux (rand (random 0.001)))
309 (declare (ignorable i))
310 #-deadlock-test-timing
311 (sleep rand)
312 #+deadlock-test-timing
313 (multiple-value-bind (t0_sec t0_nsec) (clock-gettime)
314 (message "Sleep ~f, threads=~d" i rand (thread-count))
315 (sleep rand)
316 (message "Done (~f sec)" i (seconds-since t0_sec t0_nsec))))
317 :arguments i
318 :name (format nil "SLEEP-~D" i))
319 (make-join-thread
320 (lambda (i)
321 (declare (ignorable i))
322 ;; KLUDGE: what we are doing here is explicit,
323 ;; but the same can happen because of a regular
324 ;; MAKE-THREAD or LIST-ALL-THREADS, and various
325 ;; session functions.
326 #-deadlock-test-timing
327 (progn
328 (sb-thread::with-session-lock (sb-thread::*session*)
329 (sb-ext:gc)))
330 #+deadlock-test-timing
331 (sb-int:binding* (((t0_sec t0_nsec) (clock-gettime))
332 ((t1_sec t1_nsec wait-time) (values nil nil nil)))
333 (message "GC, threads=~d" i (thread-count))
334 (sb-thread::with-session-lock (sb-thread::*session*)
335 (setq wait-time (seconds-since t0_sec t0_nsec))
336 (multiple-value-setq (t1_sec t1_nsec) (clock-gettime))
337 (sb-ext:gc))
338 (let ((gc-time (seconds-since t1_sec t1_nsec)))
339 (message "GC Done (~f wait + ~f gc)~A"
340 i wait-time gc-time
341 (if (> gc-time .5) "***" "")))))
342 :arguments i
343 :name (format nil "GC-~D" i)))
344 running)
345 (error (e)
346 ;; Not sure why this needs to back off - at most it was running 2 threads.
347 (format t "~%error creating thread ~D: ~A -- backing off for retry~%" i e)
348 (sleep 0.1)
349 (incf i))))
350 #+deadlock-test-timing
351 (progn
352 (format t "~&Main thread: draining messages~%") (force-output)
353 (loop while (< (cas *message-out-counter* 0 0) (length *messages*))
354 do (show-queued-messages)
355 (sleep .05))
356 (format t "Maxima attained: ~d ~d ~d ~d current=~d~%"
357 *max-avl-tree-total*
358 *max-avl-tree-born*
359 *max-avl-tree-running*
360 *max-avl-tree-died*
361 (thread-count))
362 (sb-thread:make-thread #'list :name "list")
363 (format t "After another make-thread: current=~d~%" (thread-count))
364 (let (gc-times)
365 (sb-int:dovector (x *messages*)
366 (when (string= (first x) "GC Done" :end1 7)
367 (push (fourth x) gc-times)))
368 (let ((min (reduce #'min gc-times))
369 (max (reduce #'max gc-times))
370 (sum (reduce #'+ gc-times)))
371 (format t "~&GC time: min=~f max=~f avg=~f sum=~f~%"
372 min max (/ sum (length gc-times)) sum)))))