1 #-sb-thread
(invoke-restart 'run-tests
::skip-file
)
2 #+ultrafutex
(invoke-restart 'run-tests
::skip-file
)
4 (import '(sb-thread:join-thread
6 sb-thread
:make-semaphore
8 sb-thread
:signal-semaphore
9 sb-thread
:thread-deadlock
10 sb-thread
:wait-on-semaphore
11 sb-thread
:with-mutex
))
13 (when (sb-sys:find-dynamic-foreign-symbol-address
"show_gc_generation_throughput")
14 (setf (extern-alien "show_gc_generation_throughput" int
) 0))
16 (with-test (:name
:deadlock-detection
.1)
19 do
(flet ((test (ma mb sa sb
)
24 (wait-on-semaphore sb
)
28 ;; (assert (plusp (length ...))) prevents
30 (assert (plusp (length (princ-to-string e
))))
32 (let* ((m1 (make-mutex :name
"M1"))
33 (m2 (make-mutex :name
"M2"))
34 (s1 (make-semaphore :name
"S1"))
35 (s2 (make-semaphore :name
"S2"))
36 (t1 (make-thread (test m1 m2 s1 s2
) :name
"T1"))
37 (t2 (make-thread (test m2 m1 s2 s1
) :name
"T2")))
38 ;; One will deadlock, and the other will then complete normally.
39 (let ((res (list (join-thread t1
)
41 (assert (or (equal '(:deadlock
:ok
) res
)
42 (equal '(:ok
:deadlock
) res
))))))))
44 (with-test (:name
:deadlock-detection
.2)
45 (let* ((m1 (make-mutex :name
"M1"))
46 (m2 (make-mutex :name
"M2"))
47 (s1 (make-semaphore :name
"S1"))
48 (s2 (make-semaphore :name
"S2"))
53 (wait-on-semaphore s2
)
59 (handler-bind ((thread-deadlock
62 ;; Make sure we can print the condition
64 (let ((*print-circle
* nil
))
65 (setf err
(princ-to-string e
)))
69 (assert (eq :ok
(with-mutex (m2)
72 (wait-on-semaphore s1
)
76 (assert (stringp err
)))
77 (assert (eq :ok
(join-thread t1
)))))
79 (with-test (:name
:deadlock-detection
.3
80 :broken-on
(and :darwin
:gc-stress
))
81 (let* ((m1 (make-mutex :name
"M1"))
82 (m2 (make-mutex :name
"M2"))
83 (s1 (make-semaphore :name
"S1"))
84 (s2 (make-semaphore :name
"S2"))
89 (wait-on-semaphore s2
)
93 ;; Currently we don't consider it a deadlock
94 ;; if there is a timeout in the chain.
99 (wait-on-semaphore s1
)
101 (sb-sys:with-deadline
(:seconds
0.1)
104 (sb-sys:deadline-timeout
()
108 (assert (eq :ok
(join-thread t1
)))))
110 (with-test (:name
(:deadlock-detection
:interrupts
)
112 (let* ((m1 (sb-thread:make-mutex
:name
"M1"))
113 (m2 (sb-thread:make-mutex
:name
"M2"))
114 (t1-can-go (sb-thread:make-semaphore
:name
"T1 can go"))
115 (t2-can-go (sb-thread:make-semaphore
:name
"T2 can go"))
116 (t1 (sb-thread:make-thread
118 (sb-thread:with-mutex
(m1)
119 (sb-thread:wait-on-semaphore t1-can-go
)
122 (t2 (sb-thread:make-thread
124 (sb-ext:wait-for
(eq t1
(sb-thread:mutex-owner m1
)))
125 (sb-thread:with-mutex
(m1 :wait-p t
)
126 (sb-thread:wait-on-semaphore t2-can-go
)
129 (sb-ext:wait-for
(eq m1
(sb-thread::thread-waiting-for t2
)))
130 (sb-thread:interrupt-thread t2
(lambda ()
131 (sb-thread:with-mutex
(m2 :wait-p t
)
133 (eq m2
(sb-thread::thread-waiting-for t1
)))
134 (sb-thread:signal-semaphore t2-can-go
))))
135 (sb-ext:wait-for
(eq t2
(sb-thread:mutex-owner m2
)))
136 (sb-thread:interrupt-thread t1
(lambda ()
137 (sb-thread:with-mutex
(m2 :wait-p t
)
138 (sb-thread:signal-semaphore t1-can-go
))))
139 ;; both threads should finish without a deadlock or deadlock
141 (let ((res (list (sb-thread:join-thread t1
)
142 (sb-thread:join-thread t2
))))
143 (assert (equal '(:ok1
:ok2
) res
)))))
145 (with-test (:name
(:deadlock-detection
:gc
))
146 ;; To semi-reliably trigger the error (in SBCL's where)
147 ;; it was present you had to run this for > 30 seconds,
148 ;; but that's a bit long for a single test.
149 (let* ((stop (+ 5 (get-universal-time)))
150 (m1 (sb-thread:make-mutex
:name
"m1"))
151 (t1 (sb-thread:make-thread
153 (loop until
(> (get-universal-time) stop
)
154 do
(sb-thread:with-mutex
(m1)
155 (eval `(make-array 24))))
157 (t2 (sb-thread:make-thread
159 (loop until
(> (get-universal-time) stop
)
160 do
(sb-thread:with-mutex
(m1)
161 (eval `(make-array 24))))
163 (let ((res (list (sb-thread:join-thread t1
)
164 (sb-thread:join-thread t2
))))
165 (assert (equal '(:ok
:ok
) res
)))))
167 #+deadlock-test-timing
169 (defun clock-gettime () (sb-unix:clock-gettime sb-unix
:clock-realtime
))
170 (defun seconds-since (start_sec start_nsec
)
171 (multiple-value-bind (stop_sec stop_nsec
) (clock-gettime)
172 (+ (/ (coerce (- stop_nsec start_nsec
) 'double-float
) 1000000000)
173 (- stop_sec start_sec
)))))
175 (defglobal *max-avl-tree-total
* 0)
176 (defglobal *max-avl-tree-born
* 0)
177 (defglobal *max-avl-tree-running
* 0)
178 (defglobal *max-avl-tree-died
* 0)
180 (defun avl-maptree (fun tree
)
181 (sb-int:named-let recurse
((node tree
))
184 (recurse (sb-thread::avlnode-left node
))
185 (recurse (sb-thread::avlnode-right node
)))))
187 (defun thread-count (&optional
(tree sb-thread
::*all-threads
*))
191 (sb-int:dx-flet
((mapfun (node)
192 (ecase (sb-thread::thread-%visible
(sb-thread::avlnode-data node
))
193 (0 (incf born
)) ; "can't happen" ?
196 (avl-maptree #'mapfun tree
))
197 (let ((total (+ born running died
)))
198 (macrolet ((max-into (global mine
)
199 `(let ((old ,global
))
201 (when (<= ,mine old
) (return))
202 (let ((actual (cas ,global old
,mine
)))
203 (if (>= actual old
) (return)) (setq old actual
))))))
204 (max-into *max-avl-tree-total
* total
)
205 (max-into *max-avl-tree-born
* born
)
206 (max-into *max-avl-tree-running
* running
)
207 (max-into *max-avl-tree-died
* died
))
208 (list total born running died
))))
210 ;;; I would optimistically guess that this test can no longer fail,
211 ;;; even for :win32, since there is no lisp mutex around *all-threads*.
212 ;;; (There is still the C one)
214 ;;; With the sesion lock acquisition as part of thread startup,
215 ;;; but with a patch that allows the thread creator to run while
216 ;;; the child is waiting, all this test managed to do was create thousands
217 ;;; of threads all blocked on the *SESSION-LOCK*, because MAKE-THREAD could
218 ;;; return the created thread did anything at all.
219 ;;; The way that happens is as follows:
220 ;;; thread 0: grab session lock, initiate GC
221 ;;; ... return from GC but not release session lock yet
222 ;;; thread 1: start sleeping
223 ;;; thread 2: try to grab session lock, enter a wait
224 ;;; thread 3: start sleeping
225 ;;; thread 4: try to grab session lock, enter a wait
226 ;;; thread 5: start sleeping
228 ;;; So the number of threads running simultaneously depends entirely on when
229 ;;; the even numbered threads get CPU time to release the session lock.
231 ;;; That said, I was curious why this test complete _so_ much slowly
232 ;;; with faster thread start, and more quickly with slower thread start.
233 ;;; Enabling the :deadlock-test-timing feature shows:
235 ;;; Maxima attained: 2923 1 2923 2916 current=(2923 0 1 2922)
237 ;;; tree node count ^ ^ nunber of threads in "run" state
239 ;;; The answer is obvious: the time spent in each GC is proportional to
240 ;;; the number of threads, and with faster thread start, we can actually
241 ;;; achieve *nearly* 3000 threads running at the same time.
242 ;;; With each thread allocating slightly over 4MiB of memory, that's
243 ;;; about 12GiB of additional memory for the OS to manage which has
244 ;;; a 2nd-order effect on our runtime as well.
245 ;;; Contrast this with the "old way" where we were essentially
246 ;;; firing off two threads at a time and letting them finish
247 ;;; before getting to the next two.
249 ;; (pushnew :deadlock-test-timing *features*)
251 (defglobal *message-in-counter
* 0)
252 (defglobal *message-out-counter
* 0)
253 (declaim (fixnum *message-in-counter
* *message-out-counter
*))
254 (defparameter *huge-n-threads
* 3000)
255 (defglobal *messages
* (make-array (* 2 *huge-n-threads
*)))
256 (declaim (simple-vector *messages
*))
258 (defun show-queued-messages ()
259 (loop while
(< *message-out-counter
* *message-in-counter
*)
260 do
(let ((args (aref *messages
* *message-out-counter
*)))
261 ;; If a store did not get into the array yet, bail out
262 ;; and hope it shows up by the next time we're here.
264 (return-from show-queued-messages
))
265 (apply #'format t
(concatenate 'string
"~4d " (car args
)) (cdr args
))
267 (incf *message-out-counter
*)))
269 ;;; Atomically log a message for output by the main thread (to avoid interleaving)
270 (defun message (control &rest args
)
271 (let* ((data (cons control args
))
272 (index (atomic-incf *message-in-counter
*)))
273 (setf (aref *messages
* index
) data
)))
276 (test-util::disable-profiling
)
278 ;;; This encounters the "backing off for retry" error if attempting
279 ;;; to start too many threads.
280 (defparameter *max-runnable-threads
* #+x86-64
100 #-x86-64
5)
281 (with-test (:name
:gc-deadlock
283 #+nil
(write-line "WARNING: THIS TEST WILL HANG ON FAILURE!")
284 ;; Prior to 0.9.16.46 thread exit potentially deadlocked the
285 ;; GC due to *all-threads-lock* and session lock. On earlier
286 ;; versions and at least on one specific box this test is good enough
287 ;; to catch that typically well before the 1500th iteration.
290 with n
= *huge-n-threads
*
294 (show-queued-messages)
296 #-deadlock-test-timing
297 (when (zerop (mod i
100))
300 (when (> (length running
) *max-runnable-threads
*)
301 (let ((last (car (last running
))))
302 (sb-thread:join-thread last
)
303 (setq running
(nbutlast running
))))
308 (lambda (i &aux
(rand (random 0.001)))
309 (declare (ignorable i
))
310 #-deadlock-test-timing
312 #+deadlock-test-timing
313 (multiple-value-bind (t0_sec t0_nsec
) (clock-gettime)
314 (message "Sleep ~f, threads=~d" i rand
(thread-count))
316 (message "Done (~f sec)" i
(seconds-since t0_sec t0_nsec
))))
318 :name
(format nil
"SLEEP-~D" i
))
321 (declare (ignorable i
))
322 ;; KLUDGE: what we are doing here is explicit,
323 ;; but the same can happen because of a regular
324 ;; MAKE-THREAD or LIST-ALL-THREADS, and various
325 ;; session functions.
326 #-deadlock-test-timing
328 (sb-thread::with-session-lock
(sb-thread::*session
*)
330 #+deadlock-test-timing
331 (sb-int:binding
* (((t0_sec t0_nsec
) (clock-gettime))
332 ((t1_sec t1_nsec wait-time
) (values nil nil nil
)))
333 (message "GC, threads=~d" i
(thread-count))
334 (sb-thread::with-session-lock
(sb-thread::*session
*)
335 (setq wait-time
(seconds-since t0_sec t0_nsec
))
336 (multiple-value-setq (t1_sec t1_nsec
) (clock-gettime))
338 (let ((gc-time (seconds-since t1_sec t1_nsec
)))
339 (message "GC Done (~f wait + ~f gc)~A"
341 (if (> gc-time
.5) "***" "")))))
343 :name
(format nil
"GC-~D" i
)))
346 ;; Not sure why this needs to back off - at most it was running 2 threads.
347 (format t
"~%error creating thread ~D: ~A -- backing off for retry~%" i e
)
350 #+deadlock-test-timing
352 (format t
"~&Main thread: draining messages~%") (force-output)
353 (loop while
(< (cas *message-out-counter
* 0 0) (length *messages
*))
354 do
(show-queued-messages)
356 (format t
"Maxima attained: ~d ~d ~d ~d current=~d~%"
359 *max-avl-tree-running
*
362 (sb-thread:make-thread
#'list
:name
"list")
363 (format t
"After another make-thread: current=~d~%" (thread-count))
365 (sb-int:dovector
(x *messages
*)
366 (when (string= (first x
) "GC Done" :end1
7)
367 (push (fourth x
) gc-times
)))
368 (let ((min (reduce #'min gc-times
))
369 (max (reduce #'max gc-times
))
370 (sum (reduce #'+ gc-times
)))
371 (format t
"~&GC time: min=~f max=~f avg=~f sum=~f~%"
372 min max
(/ sum
(length gc-times
)) sum
)))))