1 #-sb-thread
(invoke-restart 'run-tests
::skip-file
)
3 (import '(sb-thread:join-thread
5 sb-thread
:make-semaphore
7 sb-thread
:signal-semaphore
8 sb-thread
:thread-deadlock
9 sb-thread
:wait-on-semaphore
10 sb-thread
:with-mutex
))
12 (when (sb-sys:find-dynamic-foreign-symbol-address
"show_gc_generation_throughput")
13 (setf (extern-alien "show_gc_generation_throughput" int
) 0))
15 (with-test (:name
:deadlock-detection
.1)
18 do
(flet ((test (ma mb sa sb
)
23 (wait-on-semaphore sb
)
27 ;; (assert (plusp (length ...))) prevents
29 (assert (plusp (length (princ-to-string e
))))
31 (let* ((m1 (make-mutex :name
"M1"))
32 (m2 (make-mutex :name
"M2"))
33 (s1 (make-semaphore :name
"S1"))
34 (s2 (make-semaphore :name
"S2"))
35 (t1 (make-thread (test m1 m2 s1 s2
) :name
"T1"))
36 (t2 (make-thread (test m2 m1 s2 s1
) :name
"T2")))
37 ;; One will deadlock, and the other will then complete normally.
38 (let ((res (list (join-thread t1
)
40 (assert (or (equal '(:deadlock
:ok
) res
)
41 (equal '(:ok
:deadlock
) res
))))))))
43 (with-test (:name
:deadlock-detection
.2)
44 (let* ((m1 (make-mutex :name
"M1"))
45 (m2 (make-mutex :name
"M2"))
46 (s1 (make-semaphore :name
"S1"))
47 (s2 (make-semaphore :name
"S2"))
52 (wait-on-semaphore s2
)
58 (handler-bind ((thread-deadlock
61 ;; Make sure we can print the condition
63 (let ((*print-circle
* nil
))
64 (setf err
(princ-to-string e
)))
68 (assert (eq :ok
(with-mutex (m2)
71 (wait-on-semaphore s1
)
75 (assert (stringp err
)))
76 (assert (eq :ok
(join-thread t1
)))))
78 (with-test (:name
:deadlock-detection
.3
79 :broken-on
(and :darwin
:gc-stress
))
80 (let* ((m1 (make-mutex :name
"M1"))
81 (m2 (make-mutex :name
"M2"))
82 (s1 (make-semaphore :name
"S1"))
83 (s2 (make-semaphore :name
"S2"))
88 (wait-on-semaphore s2
)
92 ;; Currently we don't consider it a deadlock
93 ;; if there is a timeout in the chain.
98 (wait-on-semaphore s1
)
100 (sb-sys:with-deadline
(:seconds
0.1)
103 (sb-sys:deadline-timeout
()
107 (assert (eq :ok
(join-thread t1
)))))
109 (with-test (:name
(:deadlock-detection
:interrupts
)
111 (let* ((m1 (sb-thread:make-mutex
:name
"M1"))
112 (m2 (sb-thread:make-mutex
:name
"M2"))
113 (t1-can-go (sb-thread:make-semaphore
:name
"T1 can go"))
114 (t2-can-go (sb-thread:make-semaphore
:name
"T2 can go"))
115 (t1 (sb-thread:make-thread
117 (sb-thread:with-mutex
(m1)
118 (sb-thread:wait-on-semaphore t1-can-go
)
121 (t2 (sb-thread:make-thread
123 (sb-ext:wait-for
(eq t1
(sb-thread:mutex-owner m1
)))
124 (sb-thread:with-mutex
(m1 :wait-p t
)
125 (sb-thread:wait-on-semaphore t2-can-go
)
128 (sb-ext:wait-for
(eq m1
(sb-thread::thread-waiting-for t2
)))
129 (sb-thread:interrupt-thread t2
(lambda ()
130 (sb-thread:with-mutex
(m2 :wait-p t
)
132 (eq m2
(sb-thread::thread-waiting-for t1
)))
133 (sb-thread:signal-semaphore t2-can-go
))))
134 (sb-ext:wait-for
(eq t2
(sb-thread:mutex-owner m2
)))
135 (sb-thread:interrupt-thread t1
(lambda ()
136 (sb-thread:with-mutex
(m2 :wait-p t
)
137 (sb-thread:signal-semaphore t1-can-go
))))
138 ;; both threads should finish without a deadlock or deadlock
140 (let ((res (list (sb-thread:join-thread t1
)
141 (sb-thread:join-thread t2
))))
142 (assert (equal '(:ok1
:ok2
) res
)))))
144 (with-test (:name
(:deadlock-detection
:gc
))
145 ;; To semi-reliably trigger the error (in SBCL's where)
146 ;; it was present you had to run this for > 30 seconds,
147 ;; but that's a bit long for a single test.
148 (let* ((stop (+ 5 (get-universal-time)))
149 (m1 (sb-thread:make-mutex
:name
"m1"))
150 (t1 (sb-thread:make-thread
152 (loop until
(> (get-universal-time) stop
)
153 do
(sb-thread:with-mutex
(m1)
154 (eval `(make-array 24))))
156 (t2 (sb-thread:make-thread
158 (loop until
(> (get-universal-time) stop
)
159 do
(sb-thread:with-mutex
(m1)
160 (eval `(make-array 24))))
162 (let ((res (list (sb-thread:join-thread t1
)
163 (sb-thread:join-thread t2
))))
164 (assert (equal '(:ok
:ok
) res
)))))
166 #+deadlock-test-timing
168 (defun clock-gettime () (sb-unix:clock-gettime sb-unix
:clock-realtime
))
169 (defun seconds-since (start_sec start_nsec
)
170 (multiple-value-bind (stop_sec stop_nsec
) (clock-gettime)
171 (+ (/ (coerce (- stop_nsec start_nsec
) 'double-float
) 1000000000)
172 (- stop_sec start_sec
)))))
174 (defglobal *max-avl-tree-total
* 0)
175 (defglobal *max-avl-tree-born
* 0)
176 (defglobal *max-avl-tree-running
* 0)
177 (defglobal *max-avl-tree-died
* 0)
179 (defun avl-maptree (fun tree
)
180 (sb-int:named-let recurse
((node tree
))
183 (recurse (sb-thread::avlnode-left node
))
184 (recurse (sb-thread::avlnode-right node
)))))
186 (defun thread-count (&optional
(tree sb-thread
::*all-threads
*))
190 (sb-int:dx-flet
((mapfun (node)
191 (ecase (sb-thread::thread-%visible
(sb-thread::avlnode-data node
))
192 (0 (incf born
)) ; "can't happen" ?
195 (avl-maptree #'mapfun tree
))
196 (let ((total (+ born running died
)))
197 (macrolet ((max-into (global mine
)
198 `(let ((old ,global
))
200 (when (<= ,mine old
) (return))
201 (let ((actual (cas ,global old
,mine
)))
202 (if (>= actual old
) (return)) (setq old actual
))))))
203 (max-into *max-avl-tree-total
* total
)
204 (max-into *max-avl-tree-born
* born
)
205 (max-into *max-avl-tree-running
* running
)
206 (max-into *max-avl-tree-died
* died
))
207 (list total born running died
))))
209 ;;; I would optimistically guess that this test can no longer fail,
210 ;;; even for :win32, since there is no lisp mutex around *all-threads*.
211 ;;; (There is still the C one)
213 ;;; With the sesion lock acquisition as part of thread startup,
214 ;;; but with a patch that allows the thread creator to run while
215 ;;; the child is waiting, all this test managed to do was create thousands
216 ;;; of threads all blocked on the *SESSION-LOCK*, because MAKE-THREAD could
217 ;;; return the created thread did anything at all.
218 ;;; The way that happens is as follows:
219 ;;; thread 0: grab session lock, initiate GC
220 ;;; ... return from GC but not release session lock yet
221 ;;; thread 1: start sleeping
222 ;;; thread 2: try to grab session lock, enter a wait
223 ;;; thread 3: start sleeping
224 ;;; thread 4: try to grab session lock, enter a wait
225 ;;; thread 5: start sleeping
227 ;;; So the number of threads running simultaneously depends entirely on when
228 ;;; the even numbered threads get CPU time to release the session lock.
230 ;;; That said, I was curious why this test complete _so_ much slowly
231 ;;; with faster thread start, and more quickly with slower thread start.
232 ;;; Enabling the :deadlock-test-timing feature shows:
234 ;;; Maxima attained: 2923 1 2923 2916 current=(2923 0 1 2922)
236 ;;; tree node count ^ ^ nunber of threads in "run" state
238 ;;; The answer is obvious: the time spent in each GC is proportional to
239 ;;; the number of threads, and with faster thread start, we can actually
240 ;;; achieve *nearly* 3000 threads running at the same time.
241 ;;; With each thread allocating slightly over 4MiB of memory, that's
242 ;;; about 12GiB of additional memory for the OS to manage which has
243 ;;; a 2nd-order effect on our runtime as well.
244 ;;; Contrast this with the "old way" where we were essentially
245 ;;; firing off two threads at a time and letting them finish
246 ;;; before getting to the next two.
248 ;; (pushnew :deadlock-test-timing *features*)
250 (defglobal *message-in-counter
* 0)
251 (defglobal *message-out-counter
* 0)
252 (declaim (fixnum *message-in-counter
* *message-out-counter
*))
253 (defparameter *huge-n-threads
* 3000)
254 (defglobal *messages
* (make-array (* 2 *huge-n-threads
*)))
255 (declaim (simple-vector *messages
*))
257 (defun show-queued-messages ()
258 (loop while
(< *message-out-counter
* *message-in-counter
*)
259 do
(let ((args (aref *messages
* *message-out-counter
*)))
260 ;; If a store did not get into the array yet, bail out
261 ;; and hope it shows up by the next time we're here.
263 (return-from show-queued-messages
))
264 (apply #'format t
(concatenate 'string
"~4d " (car args
)) (cdr args
))
266 (incf *message-out-counter
*)))
268 ;;; Atomically log a message for output by the main thread (to avoid interleaving)
269 (defun message (control &rest args
)
270 (let* ((data (cons control args
))
271 (index (atomic-incf *message-in-counter
*)))
272 (setf (aref *messages
* index
) data
)))
275 (test-util::disable-profiling
)
277 ;;; This encounters the "backing off for retry" error if attempting
278 ;;; to start too many threads.
279 (defparameter *max-runnable-threads
* #+x86-64
100 #-x86-64
5)
280 (with-test (:name
:gc-deadlock
282 #+nil
(write-line "WARNING: THIS TEST WILL HANG ON FAILURE!")
283 ;; Prior to 0.9.16.46 thread exit potentially deadlocked the
284 ;; GC due to *all-threads-lock* and session lock. On earlier
285 ;; versions and at least on one specific box this test is good enough
286 ;; to catch that typically well before the 1500th iteration.
289 with n
= *huge-n-threads
*
293 (show-queued-messages)
295 #-deadlock-test-timing
296 (when (zerop (mod i
100))
299 (when (> (length running
) *max-runnable-threads
*)
300 (let ((last (car (last running
))))
301 (sb-thread:join-thread last
)
302 (setq running
(nbutlast running
))))
307 (lambda (i &aux
(rand (random 0.001)))
308 (declare (ignorable i
))
309 #-deadlock-test-timing
311 #+deadlock-test-timing
312 (multiple-value-bind (t0_sec t0_nsec
) (clock-gettime)
313 (message "Sleep ~f, threads=~d" i rand
(thread-count))
315 (message "Done (~f sec)" i
(seconds-since t0_sec t0_nsec
))))
317 :name
(format nil
"SLEEP-~D" i
))
320 (declare (ignorable i
))
321 ;; KLUDGE: what we are doing here is explicit,
322 ;; but the same can happen because of a regular
323 ;; MAKE-THREAD or LIST-ALL-THREADS, and various
324 ;; session functions.
325 #-deadlock-test-timing
327 (sb-thread::with-session-lock
(sb-thread::*session
*)
329 #+deadlock-test-timing
330 (sb-int:binding
* (((t0_sec t0_nsec
) (clock-gettime))
331 ((t1_sec t1_nsec wait-time
) (values nil nil nil
)))
332 (message "GC, threads=~d" i
(thread-count))
333 (sb-thread::with-session-lock
(sb-thread::*session
*)
334 (setq wait-time
(seconds-since t0_sec t0_nsec
))
335 (multiple-value-setq (t1_sec t1_nsec
) (clock-gettime))
337 (let ((gc-time (seconds-since t1_sec t1_nsec
)))
338 (message "GC Done (~f wait + ~f gc)~A"
340 (if (> gc-time
.5) "***" "")))))
342 :name
(format nil
"GC-~D" i
)))
345 ;; Not sure why this needs to back off - at most it was running 2 threads.
346 (format t
"~%error creating thread ~D: ~A -- backing off for retry~%" i e
)
349 #+deadlock-test-timing
351 (format t
"~&Main thread: draining messages~%") (force-output)
352 (loop while
(< (cas *message-out-counter
* 0 0) (length *messages
*))
353 do
(show-queued-messages)
355 (format t
"Maxima attained: ~d ~d ~d ~d current=~d~%"
358 *max-avl-tree-running
*
361 (sb-thread:make-thread
#'list
:name
"list")
362 (format t
"After another make-thread: current=~d~%" (thread-count))
364 (sb-int:dovector
(x *messages
*)
365 (when (string= (first x
) "GC Done" :end1
7)
366 (push (fourth x
) gc-times
)))
367 (let ((min (reduce #'min gc-times
))
368 (max (reduce #'max gc-times
))
369 (sum (reduce #'+ gc-times
)))
370 (format t
"~&GC time: min=~f max=~f avg=~f sum=~f~%"
371 min max
(/ sum
(length gc-times
)) sum
)))))