safepoint: Remove unused context argument.
[sbcl.git] / tests / deadlock.impure.lisp
blob3a9b6f468990d1f42dd0518fb37ed78409020852
1 #-sb-thread (invoke-restart 'run-tests::skip-file)
3 (import '(sb-thread:join-thread
4 sb-thread:make-mutex
5 sb-thread:make-semaphore
6 sb-thread:make-thread
7 sb-thread:signal-semaphore
8 sb-thread:thread-deadlock
9 sb-thread:wait-on-semaphore
10 sb-thread:with-mutex))
12 (when (sb-sys:find-dynamic-foreign-symbol-address "show_gc_generation_throughput")
13 (setf (extern-alien "show_gc_generation_throughput" int) 0))
15 (with-test (:name :deadlock-detection.1)
16 (loop
17 repeat 1000
18 do (flet ((test (ma mb sa sb)
19 (lambda ()
20 (handler-case
21 (with-mutex (ma)
22 (signal-semaphore sa)
23 (wait-on-semaphore sb)
24 (with-mutex (mb)
25 :ok))
26 (thread-deadlock (e)
27 ;; (assert (plusp (length ...))) prevents
28 ;; flushing.
29 (assert (plusp (length (princ-to-string e))))
30 :deadlock)))))
31 (let* ((m1 (make-mutex :name "M1"))
32 (m2 (make-mutex :name "M2"))
33 (s1 (make-semaphore :name "S1"))
34 (s2 (make-semaphore :name "S2"))
35 (t1 (make-thread (test m1 m2 s1 s2) :name "T1"))
36 (t2 (make-thread (test m2 m1 s2 s1) :name "T2")))
37 ;; One will deadlock, and the other will then complete normally.
38 (let ((res (list (join-thread t1)
39 (join-thread t2))))
40 (assert (or (equal '(:deadlock :ok) res)
41 (equal '(:ok :deadlock) res))))))))
43 (with-test (:name :deadlock-detection.2)
44 (let* ((m1 (make-mutex :name "M1"))
45 (m2 (make-mutex :name "M2"))
46 (s1 (make-semaphore :name "S1"))
47 (s2 (make-semaphore :name "S2"))
48 (t1 (make-thread
49 (lambda ()
50 (with-mutex (m1)
51 (signal-semaphore s1)
52 (wait-on-semaphore s2)
53 (with-mutex (m2)
54 :ok)))
55 :name "T1")))
56 (prog (err)
57 :retry
58 (handler-bind ((thread-deadlock
59 (lambda (e)
60 (unless err
61 ;; Make sure we can print the condition
62 ;; while it's active
63 (let ((*print-circle* nil))
64 (setf err (princ-to-string e)))
65 (go :retry)))))
66 (when err
67 (sleep 1))
68 (assert (eq :ok (with-mutex (m2)
69 (unless err
70 (signal-semaphore s2)
71 (wait-on-semaphore s1)
72 (sleep 1))
73 (with-mutex (m1)
74 :ok)))))
75 (assert (stringp err)))
76 (assert (eq :ok (join-thread t1)))))
78 (with-test (:name :deadlock-detection.3
79 :broken-on (and :darwin :gc-stress))
80 (let* ((m1 (make-mutex :name "M1"))
81 (m2 (make-mutex :name "M2"))
82 (s1 (make-semaphore :name "S1"))
83 (s2 (make-semaphore :name "S2"))
84 (t1 (make-thread
85 (lambda ()
86 (with-mutex (m1)
87 (signal-semaphore s1)
88 (wait-on-semaphore s2)
89 (with-mutex (m2)
90 :ok)))
91 :name "T1")))
92 ;; Currently we don't consider it a deadlock
93 ;; if there is a timeout in the chain.
94 (assert (eq :deadline
95 (handler-case
96 (with-mutex (m2)
97 (signal-semaphore s2)
98 (wait-on-semaphore s1)
99 (sleep 1)
100 (sb-sys:with-deadline (:seconds 0.1)
101 (with-mutex (m1)
102 :ok)))
103 (sb-sys:deadline-timeout ()
104 :deadline)
105 (thread-deadlock ()
106 :deadlock))))
107 (assert (eq :ok (join-thread t1)))))
109 (with-test (:name (:deadlock-detection :interrupts)
110 :broken-on :win32)
111 (let* ((m1 (sb-thread:make-mutex :name "M1"))
112 (m2 (sb-thread:make-mutex :name "M2"))
113 (t1-can-go (sb-thread:make-semaphore :name "T1 can go"))
114 (t2-can-go (sb-thread:make-semaphore :name "T2 can go"))
115 (t1 (sb-thread:make-thread
116 (lambda ()
117 (sb-thread:with-mutex (m1)
118 (sb-thread:wait-on-semaphore t1-can-go)
119 :ok1))
120 :name "T1"))
121 (t2 (sb-thread:make-thread
122 (lambda ()
123 (sb-ext:wait-for (eq t1 (sb-thread:mutex-owner m1)))
124 (sb-thread:with-mutex (m1 :wait-p t)
125 (sb-thread:wait-on-semaphore t2-can-go)
126 :ok2))
127 :name "T2")))
128 (sb-ext:wait-for (eq m1 (sb-thread::thread-waiting-for t2)))
129 (sb-thread:interrupt-thread t2 (lambda ()
130 (sb-thread:with-mutex (m2 :wait-p t)
131 (sb-ext:wait-for
132 (eq m2 (sb-thread::thread-waiting-for t1)))
133 (sb-thread:signal-semaphore t2-can-go))))
134 (sb-ext:wait-for (eq t2 (sb-thread:mutex-owner m2)))
135 (sb-thread:interrupt-thread t1 (lambda ()
136 (sb-thread:with-mutex (m2 :wait-p t)
137 (sb-thread:signal-semaphore t1-can-go))))
138 ;; both threads should finish without a deadlock or deadlock
139 ;; detection error
140 (let ((res (list (sb-thread:join-thread t1)
141 (sb-thread:join-thread t2))))
142 (assert (equal '(:ok1 :ok2) res)))))
144 (with-test (:name (:deadlock-detection :gc))
145 ;; To semi-reliably trigger the error (in SBCL's where)
146 ;; it was present you had to run this for > 30 seconds,
147 ;; but that's a bit long for a single test.
148 (let* ((stop (+ 5 (get-universal-time)))
149 (m1 (sb-thread:make-mutex :name "m1"))
150 (t1 (sb-thread:make-thread
151 (lambda ()
152 (loop until (> (get-universal-time) stop)
153 do (sb-thread:with-mutex (m1)
154 (eval `(make-array 24))))
155 :ok)))
156 (t2 (sb-thread:make-thread
157 (lambda ()
158 (loop until (> (get-universal-time) stop)
159 do (sb-thread:with-mutex (m1)
160 (eval `(make-array 24))))
161 :ok))))
162 (let ((res (list (sb-thread:join-thread t1)
163 (sb-thread:join-thread t2))))
164 (assert (equal '(:ok :ok) res)))))
166 #+deadlock-test-timing
167 (progn
168 (defun clock-gettime () (sb-unix:clock-gettime sb-unix:clock-realtime))
169 (defun seconds-since (start_sec start_nsec)
170 (multiple-value-bind (stop_sec stop_nsec) (clock-gettime)
171 (+ (/ (coerce (- stop_nsec start_nsec) 'double-float) 1000000000)
172 (- stop_sec start_sec)))))
174 (defglobal *max-avl-tree-total* 0)
175 (defglobal *max-avl-tree-born* 0)
176 (defglobal *max-avl-tree-running* 0)
177 (defglobal *max-avl-tree-died* 0)
179 (defun avl-maptree (fun tree)
180 (sb-int:named-let recurse ((node tree))
181 (when node
182 (funcall fun node)
183 (recurse (sb-thread::avlnode-left node))
184 (recurse (sb-thread::avlnode-right node)))))
186 (defun thread-count (&optional (tree sb-thread::*all-threads*))
187 (let ((born 0)
188 (running 0)
189 (died 0))
190 (sb-int:dx-flet ((mapfun (node)
191 (ecase (sb-thread::thread-%visible (sb-thread::avlnode-data node))
192 (0 (incf born)) ; "can't happen" ?
193 (1 (incf running))
194 (-1 (incf died)))))
195 (avl-maptree #'mapfun tree))
196 (let ((total (+ born running died)))
197 (macrolet ((max-into (global mine)
198 `(let ((old ,global))
199 (loop
200 (when (<= ,mine old) (return))
201 (let ((actual (cas ,global old ,mine)))
202 (if (>= actual old) (return)) (setq old actual))))))
203 (max-into *max-avl-tree-total* total)
204 (max-into *max-avl-tree-born* born)
205 (max-into *max-avl-tree-running* running)
206 (max-into *max-avl-tree-died* died))
207 (list total born running died))))
209 ;;; I would optimistically guess that this test can no longer fail,
210 ;;; even for :win32, since there is no lisp mutex around *all-threads*.
211 ;;; (There is still the C one)
213 ;;; With the sesion lock acquisition as part of thread startup,
214 ;;; but with a patch that allows the thread creator to run while
215 ;;; the child is waiting, all this test managed to do was create thousands
216 ;;; of threads all blocked on the *SESSION-LOCK*, because MAKE-THREAD could
217 ;;; return the created thread did anything at all.
218 ;;; The way that happens is as follows:
219 ;;; thread 0: grab session lock, initiate GC
220 ;;; ... return from GC but not release session lock yet
221 ;;; thread 1: start sleeping
222 ;;; thread 2: try to grab session lock, enter a wait
223 ;;; thread 3: start sleeping
224 ;;; thread 4: try to grab session lock, enter a wait
225 ;;; thread 5: start sleeping
226 ;;; ...
227 ;;; So the number of threads running simultaneously depends entirely on when
228 ;;; the even numbered threads get CPU time to release the session lock.
230 ;;; That said, I was curious why this test complete _so_ much slowly
231 ;;; with faster thread start, and more quickly with slower thread start.
232 ;;; Enabling the :deadlock-test-timing feature shows:
234 ;;; Maxima attained: 2923 1 2923 2916 current=(2923 0 1 2922)
235 ;;; ---- ----
236 ;;; tree node count ^ ^ nunber of threads in "run" state
238 ;;; The answer is obvious: the time spent in each GC is proportional to
239 ;;; the number of threads, and with faster thread start, we can actually
240 ;;; achieve *nearly* 3000 threads running at the same time.
241 ;;; With each thread allocating slightly over 4MiB of memory, that's
242 ;;; about 12GiB of additional memory for the OS to manage which has
243 ;;; a 2nd-order effect on our runtime as well.
244 ;;; Contrast this with the "old way" where we were essentially
245 ;;; firing off two threads at a time and letting them finish
246 ;;; before getting to the next two.
248 ;; (pushnew :deadlock-test-timing *features*)
250 (defglobal *message-in-counter* 0)
251 (defglobal *message-out-counter* 0)
252 (declaim (fixnum *message-in-counter* *message-out-counter*))
253 (defparameter *huge-n-threads* 3000)
254 (defglobal *messages* (make-array (* 2 *huge-n-threads*)))
255 (declaim (simple-vector *messages*))
257 (defun show-queued-messages ()
258 (loop while (< *message-out-counter* *message-in-counter*)
259 do (let ((args (aref *messages* *message-out-counter*)))
260 ;; If a store did not get into the array yet, bail out
261 ;; and hope it shows up by the next time we're here.
262 (when (eql args 0)
263 (return-from show-queued-messages))
264 (apply #'format t (concatenate 'string "~4d " (car args)) (cdr args))
265 (terpri))
266 (incf *message-out-counter*)))
268 ;;; Atomically log a message for output by the main thread (to avoid interleaving)
269 (defun message (control &rest args)
270 (let* ((data (cons control args))
271 (index (atomic-incf *message-in-counter*)))
272 (setf (aref *messages* index) data)))
274 #+darwin
275 (test-util::disable-profiling)
277 ;;; This encounters the "backing off for retry" error if attempting
278 ;;; to start too many threads.
279 (defparameter *max-runnable-threads* #+x86-64 100 #-x86-64 5)
280 (with-test (:name :gc-deadlock
281 :broken-on :win32)
282 #+nil (write-line "WARNING: THIS TEST WILL HANG ON FAILURE!")
283 ;; Prior to 0.9.16.46 thread exit potentially deadlocked the
284 ;; GC due to *all-threads-lock* and session lock. On earlier
285 ;; versions and at least on one specific box this test is good enough
286 ;; to catch that typically well before the 1500th iteration.
287 (loop
288 with i = 0
289 with n = *huge-n-threads*
290 with running = nil
291 while (< i n)
293 (show-queued-messages)
294 (incf i)
295 #-deadlock-test-timing
296 (when (zerop (mod i 100))
297 (write-char #\.)
298 (force-output))
299 (when (> (length running) *max-runnable-threads*)
300 (let ((last (car (last running))))
301 (sb-thread:join-thread last)
302 (setq running (nbutlast running))))
303 (handler-case
304 (push
305 (if (oddp i)
306 (make-join-thread
307 (lambda (i &aux (rand (random 0.001)))
308 (declare (ignorable i))
309 #-deadlock-test-timing
310 (sleep rand)
311 #+deadlock-test-timing
312 (multiple-value-bind (t0_sec t0_nsec) (clock-gettime)
313 (message "Sleep ~f, threads=~d" i rand (thread-count))
314 (sleep rand)
315 (message "Done (~f sec)" i (seconds-since t0_sec t0_nsec))))
316 :arguments i
317 :name (format nil "SLEEP-~D" i))
318 (make-join-thread
319 (lambda (i)
320 (declare (ignorable i))
321 ;; KLUDGE: what we are doing here is explicit,
322 ;; but the same can happen because of a regular
323 ;; MAKE-THREAD or LIST-ALL-THREADS, and various
324 ;; session functions.
325 #-deadlock-test-timing
326 (progn
327 (sb-thread::with-session-lock (sb-thread::*session*)
328 (sb-ext:gc)))
329 #+deadlock-test-timing
330 (sb-int:binding* (((t0_sec t0_nsec) (clock-gettime))
331 ((t1_sec t1_nsec wait-time) (values nil nil nil)))
332 (message "GC, threads=~d" i (thread-count))
333 (sb-thread::with-session-lock (sb-thread::*session*)
334 (setq wait-time (seconds-since t0_sec t0_nsec))
335 (multiple-value-setq (t1_sec t1_nsec) (clock-gettime))
336 (sb-ext:gc))
337 (let ((gc-time (seconds-since t1_sec t1_nsec)))
338 (message "GC Done (~f wait + ~f gc)~A"
339 i wait-time gc-time
340 (if (> gc-time .5) "***" "")))))
341 :arguments i
342 :name (format nil "GC-~D" i)))
343 running)
344 (error (e)
345 ;; Not sure why this needs to back off - at most it was running 2 threads.
346 (format t "~%error creating thread ~D: ~A -- backing off for retry~%" i e)
347 (sleep 0.1)
348 (incf i))))
349 #+deadlock-test-timing
350 (progn
351 (format t "~&Main thread: draining messages~%") (force-output)
352 (loop while (< (cas *message-out-counter* 0 0) (length *messages*))
353 do (show-queued-messages)
354 (sleep .05))
355 (format t "Maxima attained: ~d ~d ~d ~d current=~d~%"
356 *max-avl-tree-total*
357 *max-avl-tree-born*
358 *max-avl-tree-running*
359 *max-avl-tree-died*
360 (thread-count))
361 (sb-thread:make-thread #'list :name "list")
362 (format t "After another make-thread: current=~d~%" (thread-count))
363 (let (gc-times)
364 (sb-int:dovector (x *messages*)
365 (when (string= (first x) "GC Done" :end1 7)
366 (push (fourth x) gc-times)))
367 (let ((min (reduce #'min gc-times))
368 (max (reduce #'max gc-times))
369 (sum (reduce #'+ gc-times)))
370 (format t "~&GC time: min=~f max=~f avg=~f sum=~f~%"
371 min max (/ sum (length gc-times)) sum)))))