1 ;;; Copyright (C) 2003 Gerd Moellmann <gerd.moellmann@t-online.de>
2 ;;; All rights reserved.
4 ;;; Redistribution and use in source and binary forms, with or without
5 ;;; modification, are permitted provided that the following conditions
8 ;;; 1. Redistributions of source code must retain the above copyright
9 ;;; notice, this list of conditions and the following disclaimer.
10 ;;; 2. Redistributions in binary form must reproduce the above copyright
11 ;;; notice, this list of conditions and the following disclaimer in the
12 ;;; documentation and/or other materials provided with the distribution.
13 ;;; 3. The name of the author may not be used to endorse or promote
14 ;;; products derived from this software without specific prior written
17 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
21 ;;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 ;;; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
23 ;;; OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
24 ;;; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ;;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
27 ;;; USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
30 ;;; Statistical profiler.
34 ;;; This profiler arranges for SIGPROF interrupts to interrupt a
35 ;;; running program at regular intervals. Each time a SIGPROF occurs,
36 ;;; the current program counter and return address is recorded in a
37 ;;; vector, until a configurable maximum number of samples have been
40 ;;; A profiling report is generated from the samples array by
41 ;;; determining the Lisp functions corresponding to the recorded
42 ;;; addresses. Each program counter/return address pair forms one
43 ;;; edge in a call graph.
47 ;;; The code being generated on x86 makes determining callers reliably
48 ;;; something between extremely difficult and impossible. Example:
50 ;;; 10979F00: .entry eval::eval-stack-args(arg-count)
51 ;;; 18: pop dword ptr [ebp-8]
52 ;;; 1B: lea esp, [ebp-32]
57 ;;; 29: mov [ebp-12], edi
58 ;;; 2C: mov dword ptr [ebp-16], #x28F0000B ; nil
59 ;;; ; No-arg-parsing entry point
60 ;;; 33: mov dword ptr [ebp-20], 0
62 ;;; 3C: L0: mov edx, esp
64 ;;; 41: mov eax, [#x10979EF8] ; #<FDEFINITION object for eval::eval-stack-pop>
66 ;;; 49: mov [edx-4], ebp
68 ;;; 4E: call dword ptr [eax+5]
71 ;;; Suppose this function is interrupted by SIGPROF at 4E. At that
72 ;;; point, the frame pointer EBP has been modified so that the
73 ;;; original return address of the caller of eval-stack-args is no
74 ;;; longer where it can be found by x86-call-context, and the new
75 ;;; return address, for the call to eval-stack-pop, is not yet on the
76 ;;; stack. The effect is that x86-call-context returns something
77 ;;; bogus, which leads to wrong edges in the call graph.
79 ;;; One thing that one might try is filtering cases where the program
80 ;;; is interrupted at a call instruction. But since the above example
81 ;;; of an interrupt at a call instruction isn't the only case where
82 ;;; the stack is something x86-call-context can't really cope with,
83 ;;; this is not a general solution.
85 ;;; Random ideas for implementation:
87 ;;; * Space profiler. Sample when new pages are allocated instead of
90 ;;; * Record a configurable number of callers up the stack. That
91 ;;; could give a more complete graph when there are many small
94 ;;; * Print help strings for reports, include hints to the problem
97 ;;; * Make flat report the default since call-graph isn't that
100 (defpackage #:sb-sprof
101 (:use
#:cl
#:sb-ext
#:sb-unix
#:sb-alien
#:sb-sys
)
102 (:export
#:*sample-interval
* #:*max-samples
* #:*alloc-interval
*
103 #:start-sampling
#:stop-sampling
#:with-sampling
104 #:with-profiling
#:start-profiling
#:stop-profiling
107 (in-package #:sb-sprof
)
112 (defstruct (vertex (:constructor make-vertex
)
113 (:constructor make-scc
(scc-vertices edges
)))
114 (visited nil
:type boolean
)
115 (root nil
:type
(or null vertex
))
117 (edges () :type list
)
118 (scc-vertices () :type list
))
121 (vertex (sb-impl::missing-arg
) :type vertex
))
124 (vertices () :type list
))
126 (declaim (inline scc-p
))
127 (defun scc-p (vertex)
128 (not (null (vertex-scc-vertices vertex
))))
130 (defmacro do-vertices
((vertex graph
) &body body
)
131 `(dolist (,vertex
(graph-vertices ,graph
))
134 (defmacro do-edges
((edge edge-to vertex
) &body body
)
135 `(dolist (,edge
(vertex-edges ,vertex
))
136 (let ((,edge-to
(edge-vertex ,edge
)))
139 (defun self-cycle-p (vertex)
140 (do-edges (e to vertex
)
144 (defun map-vertices (fn vertices
)
146 (setf (vertex-visited v
) nil
))
148 (unless (vertex-visited v
)
151 ;;; Eeko Nuutila, Eljas Soisalon-Soininen, around 1992. Improves on
152 ;;; Tarjan's original algorithm by not using the stack when processing
153 ;;; trivial components. Trivial components should appear frequently
154 ;;; in a call-graph such as ours, I think. Same complexity O(V+E) as
156 (defun strong-components (vertices)
157 (let ((in-component (make-array (length vertices
)
158 :element-type
'boolean
159 :initial-element nil
))
163 (labels ((min-root (x y
)
164 (let ((rx (vertex-root x
))
165 (ry (vertex-root y
)))
166 (if (< (vertex-dfn rx
) (vertex-dfn ry
))
170 (aref in-component
(vertex-dfn v
)))
171 ((setf in-component
) (in v
)
172 (setf (aref in-component
(vertex-dfn v
)) in
))
174 (> (vertex-dfn x
) (vertex-dfn y
)))
176 (setf (vertex-dfn v
) (incf dfn
)
179 (vertex-visited v
) t
)
181 (unless (vertex-visited w
)
183 (unless (in-component w
)
184 (setf (vertex-root v
) (min-root v w
))))
185 (if (eq v
(vertex-root v
))
186 (loop while
(and stack
(vertex-> (car stack
) v
))
188 collect w into this-component
189 do
(setf (in-component w
) t
)
191 (setf (in-component v
) t
)
192 (push (cons v this-component
) components
))
194 (map-vertices #'visit vertices
)
197 ;;; Given a dag as a list of vertices, return the list sorted
198 ;;; topologically, children first.
199 (defun topological-sort (dag)
202 (labels ((rec-sort (v)
203 (setf (vertex-visited v
) t
)
204 (setf (vertex-dfn v
) (incf dfn
))
205 (dolist (e (vertex-edges v
))
206 (unless (vertex-visited (edge-vertex e
))
207 (rec-sort (edge-vertex e
))))
209 (map-vertices #'rec-sort dag
)
212 ;;; Reduce graph G to a dag by coalescing strongly connected components
213 ;;; into vertices. Sort the result topologically.
214 (defun reduce-graph (graph &optional
(scc-constructor #'make-scc
))
215 (sb-int:collect
((sccs) (trivial))
216 (dolist (c (strong-components (graph-vertices graph
)))
217 (if (or (cdr c
) (self-cycle-p (car c
)))
218 (sb-int:collect
((outgoing))
223 (sccs (funcall scc-constructor c
(outgoing))))
226 (dolist (v (trivial))
228 (when (member w
(vertex-scc-vertices scc
))
229 (setf (edge-vertex e
) scc
)))))
230 (setf (graph-vertices graph
)
231 (topological-sort (nconc (sccs) (trivial))))))
236 "Type used for addresses, for instance, program counters,
237 code start/end locations etc."
238 '(unsigned-byte #.sb-vm
::n-machine-word-bits
))
240 (defconstant +unknown-address
+ 0
241 "Constant representing an address that cannot be determined.")
243 ;;; A call graph. Vertices are NODE structures, edges are CALL
245 (defstruct (call-graph (:include graph
)
246 (:constructor %make-call-graph
))
247 ;; the value of *SAMPLE-INTERVAL* or *ALLOC-INTERVAL* at the time
248 ;; the graph was created (depending on the current allocation mode)
249 (sample-interval (sb-impl::missing-arg
) :type number
)
250 ;; the sampling-mode that was used for the profiling run
251 (sampling-mode (sb-impl::missing-arg
) :type
(member :cpu
:alloc
))
252 ;; number of samples taken
253 (nsamples (sb-impl::missing-arg
) :type sb-impl
::index
)
254 ;; sample count for samples not in any function
255 (elsewhere-count (sb-impl::missing-arg
) :type sb-impl
::index
)
256 ;; a flat list of NODEs, sorted by sample count
257 (flat-nodes () :type list
))
259 ;;; A node in a call graph, representing a function that has been
260 ;;; sampled. The edges of a node are CALL structures that represent
261 ;;; functions called from a given node.
262 (defstruct (node (:include vertex
)
263 (:constructor %make-node
))
264 ;; A numeric label for the node. The most frequently called function
265 ;; gets label 1. This is just for identification purposes in the
267 (index 0 :type fixnum
)
268 ;; Start and end address of the function's code. Depending on the
269 ;; debug-info, this might be either as absolute addresses for things
270 ;; that won't move around in memory, or as relative offsets from
271 ;; some point for things that might move.
272 (start-pc-or-offset 0 :type address
)
273 (end-pc-or-offset 0 :type address
)
274 ;; the name of the function
276 ;; sample count for this function
277 (count 0 :type fixnum
)
278 ;; count including time spent in functions called from this one
279 (accrued-count 0 :type fixnum
)
280 ;; the debug-info that this node was created from
281 (debug-info nil
:type t
)
282 ;; list of NODEs for functions calling this one
283 (callers () :type list
))
285 ;;; A cycle in a call graph. The functions forming the cycle are
286 ;;; found in the SCC-VERTICES slot of the VERTEX structure.
287 (defstruct (cycle (:include node
)))
289 ;;; An edge in a call graph. EDGE-VERTEX is the function being
291 (defstruct (call (:include edge
)
292 (:constructor make-call
(vertex)))
293 ;; number of times the call was sampled
294 (count 1 :type sb-impl
::index
))
296 ;;; Encapsulate all the information about a sampling run
298 ;; When this vector fills up, we allocate a new one and copy over
300 (vector (make-array (* *max-samples
*
301 ;; Arbitrary guess at how many samples we'll be
302 ;; taking for each trace. The exact amount doesn't
303 ;; matter, this is just to decrease the amount of
304 ;; re-allocation that will need to be done.
306 ;; Each sample takes two cells in the vector
309 (trace-count 0 :type sb-impl
::index
)
310 (index 0 :type sb-impl
::index
)
311 (mode nil
:type
(member :cpu
:alloc
))
312 (sample-interval *sample-interval
* :type number
)
313 (alloc-interval *alloc-interval
* :type number
)
314 (max-depth most-positive-fixnum
:type number
)
315 (max-samples *max-samples
* :type sb-impl
::index
))
317 (defmethod print-object ((call-graph call-graph
) stream
)
318 (print-unreadable-object (call-graph stream
:type t
:identity t
)
319 (format stream
"~d samples" (call-graph-nsamples call-graph
))))
321 (defmethod print-object ((node node
) stream
)
322 (print-unreadable-object (node stream
:type t
:identity t
)
323 (format stream
"~s [~d]" (node-name node
) (node-index node
))))
325 (defmethod print-object ((call call
) stream
)
326 (print-unreadable-object (call stream
:type t
:identity t
)
327 (format stream
"~s [~d]" (node-name (call-vertex call
))
328 (node-index (call-vertex call
)))))
330 (deftype report-type
()
331 '(member nil
:flat
:graph
))
333 (defvar *sampling-mode
* :cpu
334 "Default sampling mode. :CPU for cpu profiling, :ALLOC for allocation
336 (declaim (type (member :cpu
:alloc
) *sampling-mode
*))
338 (defvar *sample-interval
* 0.01
339 "Default number of seconds between samples.")
340 (declaim (number *sample-interval
*))
342 (defvar *alloc-region-size
*
345 ;; This hardcoded 2 matches the one in gc_find_freeish_pages. It's not
346 ;; really worth genesifying.
348 (* 2 sb-vm
:gencgc-page-size
))
349 (declaim (number *alloc-region-size
*))
351 (defvar *alloc-interval
* 4
352 "Default number of allocation region openings between samples.")
353 (declaim (number *alloc-interval
*))
355 (defvar *max-samples
* 50000
356 "Default number of traces taken. This variable is somewhat misnamed:
357 each trace may actually consist of an arbitrary number of samples, depending
358 on the depth of the call stack.")
359 (declaim (type sb-impl
::index
*max-samples
*))
361 (defvar *samples
* nil
)
362 (declaim (type (or null samples
) *samples
*))
364 (defvar *profiling
* nil
)
365 (defvar *sampling
* nil
)
366 (declaim (type boolean
*profiling
* *sampling
*))
368 (defvar *show-progress
* nil
)
370 (defvar *old-sampling
* nil
)
372 (defun turn-off-sampling ()
373 (setq *old-sampling
* *sampling
*)
374 (setq *sampling
* nil
))
376 (defun turn-on-sampling ()
377 (setq *sampling
* *old-sampling
*))
379 (defun show-progress (format-string &rest args
)
380 (when *show-progress
*
381 (apply #'format t format-string args
)
384 (defun start-sampling ()
385 "Switch on statistical sampling."
388 (defun stop-sampling ()
389 "Switch off statistical sampling."
390 (setq *sampling
* nil
))
392 (defmacro with-sampling
((&optional
(on t
)) &body body
)
393 "Evaluate body with statistical sampling turned on or off."
394 `(let ((*sampling
* ,on
)
395 (sb-vm:*alloc-signal
* sb-vm
:*alloc-signal
*))
398 ;;; Return something serving as debug info for address PC.
399 (declaim (inline debug-info
))
400 (defun debug-info (pc)
401 (declare (type system-area-pointer pc
)
402 (muffle-conditions compiler-note
))
403 (let ((ptr (sb-di::component-ptr-from-pc pc
)))
404 (cond ((sap= ptr
(int-sap 0))
405 (let ((name (sap-foreign-symbol pc
)))
407 (values (format nil
"foreign function ~a" name
)
409 (values nil
(sap-int pc
)))))
411 (let* ((code (sb-di::component-from-component-ptr ptr
))
412 (code-header-len (* (sb-kernel:get-header-data code
)
414 (pc-offset (- (sap-int pc
)
415 (- (sb-kernel:get-lisp-obj-address code
)
416 sb-vm
:other-pointer-lowtag
)
418 (df (sb-di::debug-fun-from-pc code pc-offset
)))
419 (cond ((typep df
'sb-di
::bogus-debug-fun
)
420 (values code
(sap-int pc
)))
422 ;; The code component might be moved by the GC. Store
423 ;; a PC offset, and reconstruct the data in
424 ;; SAMPLE-PC-FROM-PC-OR-OFFSET.
425 (values df pc-offset
))
427 (values nil
0))))))))
429 (defun ensure-samples-vector (samples)
430 (let ((vector (samples-vector samples
))
431 (index (samples-index samples
)))
432 ;; Allocate a new sample vector if the old one is full
433 (if (= (length vector
) index
)
434 (let ((new-vector (make-array (* 2 index
))))
435 (format *trace-output
* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%"
436 (samples-trace-count samples
)
438 (replace new-vector vector
)
439 (setf (samples-vector samples
) new-vector
))
442 (declaim (inline record
))
443 (defun record (samples pc
)
444 (declare (type system-area-pointer pc
)
445 (muffle-conditions compiler-note
))
446 (multiple-value-bind (info pc-or-offset
)
448 (let ((vector (ensure-samples-vector samples
))
449 (index (samples-index samples
)))
450 (declare (type simple-vector vector
))
451 ;; Allocate a new sample vector if the old one is full
452 (when (= (length vector
) index
)
453 (let ((new-vector (make-array (* 2 index
))))
454 (format *trace-output
* "Profiler sample vector full (~a traces / ~a samples), doubling the size~%"
455 (samples-trace-count samples
)
457 (replace new-vector vector
)
458 (setf vector new-vector
459 (samples-vector samples
) new-vector
)))
460 ;; For each sample, store the debug-info and the PC/offset into
462 (setf (aref vector index
) info
463 (aref vector
(1+ index
)) pc-or-offset
)))
464 (incf (samples-index samples
) 2))
466 (defun record-trace-start (samples)
467 ;; Mark the start of the trace.
468 (let ((vector (ensure-samples-vector samples
)))
469 (declare (type simple-vector vector
))
470 (setf (aref vector
(samples-index samples
))
472 (incf (samples-index samples
) 2))
474 ;;; Ensure that only one thread at a time will be executing sigprof handler.
475 (defvar *sigprof-handler-lock
* (sb-thread:make-mutex
:name
"SIGPROF handler"))
477 ;;; SIGPROF handler. Record current PC and return address in
480 (defun sigprof-handler (signal code scp
)
481 (declare (ignore signal code
)
482 (optimize speed
(space 0))
483 (muffle-conditions compiler-note
)
484 (disable-package-locks sb-di
::x86-call-context
)
485 (type system-area-pointer scp
))
486 (sb-sys:without-interrupts
487 (let ((sb-vm:*alloc-signal
* nil
)
489 (when (and *sampling
*
491 (< (samples-trace-count samples
)
492 (samples-max-samples samples
)))
493 (sb-sys:without-gcing
494 (sb-thread:with-mutex
(*sigprof-handler-lock
*)
495 (with-alien ((scp (* os-context-t
) :local scp
))
496 (let* ((pc-ptr (sb-vm:context-pc scp
))
497 (fp (sb-vm::context-register scp
#.sb-vm
::ebp-offset
)))
498 ;; For some reason completely bogus small values for the
499 ;; frame pointer are returned every now and then, leading
500 ;; to segfaults. Try to avoid these cases.
502 ;; FIXME: Do a more thorough sanity check on ebp, or figure
503 ;; out why this is happening.
504 ;; -- JES, 2005-01-11
506 (return-from sigprof-handler nil
))
507 (incf (samples-trace-count samples
))
508 (let ((fp (int-sap fp
))
510 (declare (type system-area-pointer fp pc-ptr
))
511 ;; FIXME: How annoying. The XC doesn't store enough
512 ;; type information about SB-DI::X86-CALL-CONTEXT,
513 ;; even if we declaim the ftype explicitly in
514 ;; src/code/debug-int. And for some reason that type
515 ;; information is needed for the inlined version to
516 ;; be compiled without boxing the returned saps. So
517 ;; we declare the correct ftype here manually, even
518 ;; if the compiler should be able to deduce this
519 ;; exact same information.
520 (declare (ftype (function (system-area-pointer)
521 (values (member nil t
)
523 system-area-pointer
))
524 sb-di
::x86-call-context
))
525 (record-trace-start samples
)
526 (dotimes (i (samples-max-depth samples
))
527 (record samples pc-ptr
)
528 (setf (values ok pc-ptr fp
)
529 (sb-di::x86-call-context fp
))
532 ;; Reset the allocation counter
533 (when (and sb-vm
:*alloc-signal
*
534 (<= sb-vm
:*alloc-signal
* 0))
535 (setf sb-vm
:*alloc-signal
* (1- *alloc-interval
*)))
538 ;; FIXME: On non-x86 platforms we don't yet walk the call stack deeper
541 (defun sigprof-handler (signal code scp
)
542 (declare (ignore signal code
))
543 (sb-sys:without-interrupts
544 (let ((samples *samples
*))
545 (when (and *sampling
*
547 (< (samples-trace-count samples
)
548 (samples-max-samples samples
)))
549 (sb-sys:without-gcing
550 (with-alien ((scp (* os-context-t
) :local scp
))
551 (locally (declare (optimize (inhibit-warnings 2)))
552 (record-trace-start samples
)
553 (let* ((pc-ptr (sb-vm:context-pc scp
))
554 (fp (sb-vm::context-register scp
#.sb-vm
::cfp-offset
))
557 (* sb-vm
::lra-save-offset sb-vm
::n-word-bytes
))))
558 (record samples pc-ptr
)
559 (record samples
(int-sap ra
))))))))))
561 ;;; Return the start address of CODE.
562 (defun code-start (code)
563 (declare (type sb-kernel
:code-component code
))
564 (sap-int (sb-kernel:code-instructions code
)))
566 ;;; Return start and end address of CODE as multiple values.
567 (defun code-bounds (code)
568 (declare (type sb-kernel
:code-component code
))
569 (let* ((start (code-start code
))
570 (end (+ start
(sb-kernel:%code-code-size code
))))
573 (defmacro with-profiling
((&key
(sample-interval '*sample-interval
*)
574 (alloc-interval '*alloc-interval
*)
575 (max-samples '*max-samples
*)
577 (mode '*sampling-mode
*)
579 (max-depth most-positive-fixnum
)
581 (report nil report-p
))
583 "Repeatedly evaluate BODY with statistical profiling turned on.
584 In multi-threaded operation, only the thread in which WITH-PROFILING
585 was evaluated will be profiled by default. If you want to profile
586 multiple threads, invoke the profiler with START-PROFILING.
588 The following keyword args are recognized:
591 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
594 Take a sample every time <n> allocation regions (approximately
595 8kB) have been allocated since the last sample. Default is
599 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
600 the profiler in allocation profiling mode.
603 Repeat evaluating body until <max> samples are taken.
604 Default is *MAX-SAMPLES*.
607 Maximum call stack depth that the profiler should consider. Only
608 has an effect on x86 and x86-64.
611 If specified, call REPORT with :TYPE <type> at the end.
614 It true, call RESET at the beginning.
617 If true (the default) repeatedly evaluate BODY. If false, evaluate
619 (declare (type report-type report
))
620 `(let* ((*sample-interval
* ,sample-interval
)
621 (*alloc-interval
* ,alloc-interval
)
623 (sb-vm:*alloc-signal
* nil
)
624 (*sampling-mode
* ,mode
)
625 (*max-samples
* ,max-samples
))
626 ,@(when reset
'((reset)))
629 (start-profiling :max-depth
',max-depth
)
631 (when (>= (samples-trace-count *samples
*)
632 (samples-max-samples *samples
*))
634 ,@(when show-progress
635 `((format t
"~&===> ~d of ~d samples taken.~%"
636 (samples-trace-count *samples
*)
637 (samples-max-samples *samples
*))))
638 (let ((.last-index.
(samples-index *samples
*)))
640 (when (= .last-index.
(samples-index *samples
*))
641 (warn "No sampling progress; possibly a profiler bug.")
646 ,@(when report-p
`((report :type
,report
)))))
648 (defun start-profiling (&key
(max-samples *max-samples
*)
649 (mode *sampling-mode
*)
650 (sample-interval *sample-interval
*)
651 (alloc-interval *alloc-interval
*)
652 (max-depth most-positive-fixnum
)
654 "Start profiling statistically if not already profiling.
655 The following keyword args are recognized:
658 Take a sample every <n> seconds. Default is *SAMPLE-INTERVAL*.
661 Take a sample every time <n> allocation regions (approximately
662 8kB) have been allocated since the last sample. Default is
666 If :CPU, run the profiler in CPU profiling mode. If :ALLOC, run
667 the profiler in allocation profiling mode.
670 Maximum number of samples. Default is *MAX-SAMPLES*.
673 Maximum call stack depth that the profiler should consider. Only
674 has an effect on x86 and x86-64.
677 If true, the default, start sampling right away.
678 If false, START-SAMPLING can be used to turn sampling on."
680 (when (eq mode
:alloc
)
681 (error "Allocation profiling is only supported for builds using the generational garbage collector."))
683 (multiple-value-bind (secs usecs
)
684 (multiple-value-bind (secs rest
)
685 (truncate sample-interval
)
686 (values secs
(truncate (* rest
1000000))))
687 (setf *sampling
* sampling
688 *samples
* (make-samples :max-depth max-depth
689 :max-samples max-samples
691 (sb-sys:enable-interrupt sb-unix
:sigprof
#'sigprof-handler
)
693 (setf sb-vm
:*alloc-signal
* (1- alloc-interval
))
695 (unix-setitimer :profile secs usecs secs usecs
)
696 (setf sb-vm
:*alloc-signal
* nil
)))
697 (setq *profiling
* t
)))
700 (defun stop-profiling ()
701 "Stop profiling if profiling."
703 (unix-setitimer :profile
0 0 0 0)
704 ;; Even with the timer shut down we cannot be sure that there is
705 ;; no undelivered sigprof. Besides, leaving the signal handler
706 ;; installed won't hurt.
707 (setq *sampling
* nil
)
708 (setq sb-vm
:*alloc-signal
* nil
)
709 (setq *profiling
* nil
))
713 "Reset the profiler."
715 (setq *sampling
* nil
)
719 ;;; Make a NODE for debug-info INFO.
720 (defun make-node (info)
721 (flet ((clean-name (name)
722 (if (and (consp name
)
724 '(sb-c::xep sb-c
::tl-xep sb-c
::&more-processor
727 sb-c
::hairy-arg-processor
728 sb-c
::&optional-processor
)))
732 (sb-kernel::code-component
733 (multiple-value-bind (start end
)
736 (%make-node
:name
(or (sb-disassem::find-assembler-routine start
)
737 (format nil
"~a" info
))
739 :start-pc-or-offset start
740 :end-pc-or-offset end
)
742 (sb-di::compiled-debug-fun
743 (let* ((name (sb-di::debug-fun-name info
))
744 (cdf (sb-di::compiled-debug-fun-compiler-debug-fun info
))
745 (start-offset (sb-c::compiled-debug-fun-start-pc cdf
))
746 (end-offset (sb-c::compiled-debug-fun-elsewhere-pc cdf
))
747 (component (sb-di::compiled-debug-fun-component info
))
748 (start-pc (code-start component
)))
749 ;; Call graphs are mostly useless unless we somehow
750 ;; distinguish a gazillion different (LAMBDA ())'s.
751 (when (equal name
'(lambda ()))
752 (setf name
(format nil
"Unknown component: #x~x" start-pc
)))
753 (values (%make-node
:name
(clean-name name
)
755 :start-pc-or-offset start-offset
756 :end-pc-or-offset end-offset
)
759 (%make-node
:name
(clean-name (sb-di::debug-fun-name info
))
762 (%make-node
:name
(coerce info
'string
)
763 :debug-info info
)))))
765 ;;; One function can have more than one COMPILED-DEBUG-FUNCTION with
766 ;;; the same name. Reduce the number of calls to Debug-Info by first
767 ;;; looking for a given PC in a red-black tree. If not found in the
768 ;;; tree, get debug info, and look for a node in a hash-table by
769 ;;; function name. If not found in the hash-table, make a new node.
771 (defvar *name-
>node
*)
773 (defmacro with-lookup-tables
(() &body body
)
774 `(let ((*name-
>node
* (make-hash-table :test
'equal
)))
777 ;;; Find or make a new node for INFO. Value is the NODE found or
778 ;;; made; NIL if not enough information exists to make a NODE for INFO.
779 (defun lookup-node (info)
781 (multiple-value-bind (new key
)
783 (let* ((key (cons (node-name new
) key
))
784 (found (gethash key
*name-
>node
*)))
786 (setf (node-start-pc-or-offset found
)
787 (min (node-start-pc-or-offset found
)
788 (node-start-pc-or-offset new
)))
789 (setf (node-end-pc-or-offset found
)
790 (max (node-end-pc-or-offset found
)
791 (node-end-pc-or-offset new
)))
794 (setf (gethash key
*name-
>node
*) new
)
797 ;;; Return a list of all nodes created by LOOKUP-NODE.
798 (defun collect-nodes ()
799 (loop for node being the hash-values of
*name-
>node
*
802 ;;; Value is a CALL-GRAPH for the current contents of *SAMPLES*.
803 (defun make-call-graph-1 (max-depth)
804 (let ((elsewhere-count 0)
806 (with-lookup-tables ()
807 (loop for i below
(- (samples-index *samples
*) 2) by
2
809 for debug-info
= (aref (samples-vector *samples
*) i
)
810 for next-info
= (aref (samples-vector *samples
*)
812 do
(if (eq debug-info
'trace-start
)
814 (let ((callee (lookup-node debug-info
))
815 (caller (unless (eq next-info
'trace-start
)
816 (lookup-node next-info
))))
817 (when (< depth max-depth
)
819 (setf visited-nodes nil
)
821 (incf (node-accrued-count callee
))
822 (incf (node-count callee
)))
824 (incf elsewhere-count
))))
827 (push callee visited-nodes
))
829 (unless (member caller visited-nodes
)
830 (incf (node-accrued-count caller
)))
832 (let ((call (find callee
(node-edges caller
)
833 :key
#'call-vertex
)))
834 (pushnew caller
(node-callers callee
))
836 (unless (member caller visited-nodes
)
837 (incf (call-count call
)))
838 (push (make-call callee
)
839 (node-edges caller
))))))))))
840 (let ((sorted-nodes (sort (collect-nodes) #'> :key
#'node-count
)))
841 (loop for node in sorted-nodes and i from
1 do
842 (setf (node-index node
) i
))
843 (%make-call-graph
:nsamples
(samples-trace-count *samples
*)
844 :sample-interval
(if (eq (samples-mode *samples
*)
846 (samples-alloc-interval *samples
*)
847 (samples-sample-interval *samples
*))
848 :sampling-mode
(samples-mode *samples
*)
849 :elsewhere-count elsewhere-count
850 :vertices sorted-nodes
)))))
852 ;;; Reduce CALL-GRAPH to a dag, creating CYCLE structures for call
854 (defun reduce-call-graph (call-graph)
856 (flet ((make-one-cycle (vertices edges
)
857 (let* ((name (format nil
"<Cycle ~d>" (incf cycle-no
)))
858 (count (loop for v in vertices sum
(node-count v
))))
859 (make-cycle :name name
862 :scc-vertices vertices
864 (reduce-graph call-graph
#'make-one-cycle
))))
866 ;;; For all nodes in CALL-GRAPH, compute times including the time
867 ;;; spent in functions called from them. Note that the call-graph
868 ;;; vertices are in reverse topological order, children first, so we
869 ;;; will have computed accrued counts of called functions before they
870 ;;; are used to compute accrued counts for callers.
871 (defun compute-accrued-counts (call-graph)
872 (do-vertices (from call-graph
)
873 (setf (node-accrued-count from
) (node-count from
))
874 (do-edges (call to from
)
875 (incf (node-accrued-count from
)
876 (round (* (/ (call-count call
) (node-count to
))
877 (node-accrued-count to
)))))))
879 ;;; Return a CALL-GRAPH structure for the current contents of
880 ;;; *SAMPLES*. The result contain a list of nodes sorted by self-time
881 ;;; in the FLAT-NODES slot, and a dag in VERTICES, with call cycles
882 ;;; reduced to CYCLE structures.
883 (defun make-call-graph (max-depth)
885 (show-progress "~&Computing call graph ")
886 (let ((call-graph (without-gcing (make-call-graph-1 max-depth
))))
887 (setf (call-graph-flat-nodes call-graph
)
888 (copy-list (graph-vertices call-graph
)))
889 (show-progress "~&Finding cycles")
891 (reduce-call-graph call-graph
)
892 (show-progress "~&Propagating counts")
894 (compute-accrued-counts call-graph
)
900 (defun print-separator (&key
(length 72) (char #\-
))
901 (format t
"~&~V,,,V<~>~%" length char
))
903 (defun samples-percent (call-graph count
)
905 (* 100.0 (/ count
(call-graph-nsamples call-graph
)))
908 (defun print-call-graph-header (call-graph)
909 (let ((nsamples (call-graph-nsamples call-graph
))
910 (interval (call-graph-sample-interval call-graph
))
911 (ncycles (loop for v in
(graph-vertices call-graph
)
913 (if (eq (call-graph-sampling-mode call-graph
) :alloc
)
914 (format t
"~2&Number of samples: ~d~%~
915 Sample interval: ~a regions (approximately ~a kB)~%~
916 Total sampling amount: ~a regions (approximately ~a kB)~%~
917 Number of cycles: ~d~2%"
920 (truncate (* interval
*alloc-region-size
*) 1024)
921 (* nsamples interval
)
922 (truncate (* nsamples interval
*alloc-region-size
*) 1024)
924 (format t
"~2&Number of samples: ~d~%~
925 Sample interval: ~f seconds~%~
926 Total sampling time: ~f seconds~%~
927 Number of cycles: ~d~2%"
930 (* nsamples interval
)
933 (defun print-flat (call-graph &key
(stream *standard-output
*) max
934 min-percent
(print-header t
))
935 (let ((*standard-output
* stream
)
939 (min-count (if min-percent
940 (round (* (/ min-percent
100.0)
941 (call-graph-nsamples call-graph
)))
944 (print-call-graph-header call-graph
))
945 (format t
"~& Self Total Cumul~%")
946 (format t
"~& Nr Count % Count % Count % Function~%")
948 (let ((elsewhere-count (call-graph-elsewhere-count call-graph
))
950 (dolist (node (call-graph-flat-nodes call-graph
))
951 (when (or (and max
(> (incf i
) max
))
952 (< (node-count node
) min-count
))
954 (let* ((count (node-count node
))
955 (percent (samples-percent call-graph count
))
956 (accrued-count (node-accrued-count node
))
957 (accrued-percent (samples-percent call-graph accrued-count
)))
958 (incf total-count count
)
959 (incf total-percent percent
)
960 (format t
"~&~4d ~6d ~5,1f ~6d ~5,1f ~6d ~5,1f ~s~%"
971 (format t
"~& ~6d ~5,1f elsewhere~%"
973 (samples-percent call-graph elsewhere-count
)))))
975 (defun print-cycles (call-graph)
976 (when (some #'cycle-p
(graph-vertices call-graph
))
977 (format t
"~& Cycle~%")
978 (format t
"~& Count % Parts~%")
979 (do-vertices (node call-graph
)
981 (flet ((print-info (indent index count percent name
)
982 (format t
"~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
983 count percent indent name index
)))
985 (format t
"~&~6d ~5,1f ~a...~%"
987 (samples-percent call-graph
(cycle-count node
))
989 (dolist (v (vertex-scc-vertices node
))
990 (print-info 4 (node-index v
) (node-count v
)
991 (samples-percent call-graph
(node-count v
))
996 (defun print-graph (call-graph &key
(stream *standard-output
*)
998 (let ((*standard-output
* stream
)
999 (*print-pretty
* nil
))
1000 (print-call-graph-header call-graph
)
1001 (print-cycles call-graph
)
1002 (flet ((find-call (from to
)
1003 (find to
(node-edges from
) :key
#'call-vertex
))
1004 (print-info (indent index count percent name
)
1005 (format t
"~&~6d ~5,1f ~11@t ~V@t ~s [~d]~%"
1006 count percent indent name index
)))
1007 (format t
"~& Callers~%")
1008 (format t
"~& Total. Function~%")
1009 (format t
"~& Count % Count % Callees~%")
1010 (do-vertices (node call-graph
)
1013 ;; Print caller information.
1014 (dolist (caller (node-callers node
))
1015 (let ((call (find-call caller node
)))
1016 (print-info 4 (node-index caller
)
1018 (samples-percent call-graph
(call-count call
))
1019 (node-name caller
))))
1020 ;; Print the node itself.
1021 (format t
"~&~6d ~5,1f ~6d ~5,1f ~s [~d]~%"
1023 (samples-percent call-graph
(node-count node
))
1024 (node-accrued-count node
)
1025 (samples-percent call-graph
(node-accrued-count node
))
1029 (do-edges (call called node
)
1030 (print-info 4 (node-index called
)
1032 (samples-percent call-graph
(call-count call
))
1033 (node-name called
))))
1036 (print-flat call-graph
:stream stream
:max max
1037 :min-percent min-percent
:print-header nil
))))
1039 (defun report (&key
(type :graph
) max min-percent call-graph
1040 (stream *standard-output
*) ((:show-progress
*show-progress
*)))
1041 "Report statistical profiling results. The following keyword
1042 args are recognized:
1045 Specifies the type of report to generate. If :FLAT, show
1046 flat report, if :GRAPH show a call graph and a flat report.
1047 If nil, don't print out a report.
1050 Specify a stream to print the report on. Default is
1054 Don't show more than <max> entries in the flat report.
1056 :MIN-PERCENT <min-percent>
1057 Don't show functions taking less than <min-percent> of the
1058 total time in the flat report.
1060 :SHOW-PROGRESS <bool>
1061 If true, print progress messages while generating the call graph.
1064 Print a report from <graph> instead of the latest profiling
1067 Value of this function is a CALL-GRAPH object representing the
1068 resulting call-graph."
1069 (let ((graph (or call-graph
(make-call-graph most-positive-fixnum
))))
1072 (print-flat graph
:stream stream
:max max
:min-percent min-percent
))
1074 (print-graph graph
:stream stream
:max max
:min-percent min-percent
))
1078 ;;; Interface to DISASSEMBLE
1080 (defun sample-pc-from-pc-or-offset (sample pc-or-offset
)
1082 ;; Assembly routines or foreign functions don't move around, so we've
1084 ((or sb-kernel
:code-component string
)
1086 ;; Lisp functions might move, so we've stored a offset from the
1087 ;; start of the code component.
1088 (sb-di::compiled-debug-fun
1089 (let* ((component (sb-di::compiled-debug-fun-component sample
))
1090 (start-pc (code-start component
)))
1091 (+ start-pc pc-or-offset
)))))
1093 (defun add-disassembly-profile-note (chunk stream dstate
)
1094 (declare (ignore chunk stream
))
1096 (let* ((location (+ (sb-disassem::seg-virtual-location
1097 (sb-disassem:dstate-segment dstate
))
1098 (sb-disassem::dstate-cur-offs dstate
)))
1099 (samples (loop with index
= (samples-index *samples
*)
1100 for x from
0 below
(- index
2) by
2
1101 for last-sample
= nil then sample
1102 for sample
= (aref (samples-vector *samples
*) x
)
1103 for pc-or-offset
= (aref (samples-vector *samples
*)
1105 when
(and sample
(eq last-sample
'trace-start
))
1107 (sample-pc-from-pc-or-offset sample
1109 (unless (zerop samples
)
1110 (sb-disassem::note
(format nil
"~A/~A samples"
1111 samples
(samples-trace-count *samples
*))
1114 (pushnew 'add-disassembly-profile-note sb-disassem
::*default-dstate-hooks
*)
1118 (defun test-0 (n &optional
(depth 0))
1119 (declare (optimize (debug 3)))
1122 (test-0 n
(1+ depth
))
1123 (test-0 n
(1+ depth
)))))
1126 (with-profiling (:reset t
:max-samples
1000 :report
:graph
)