1 ;;; Allocation profiler
3 ;;; Quick start: To enable this code, you must DECLAIM or DECLARE
4 ;;; (OPTIMIZE SB-C::INSTRUMENT-CONSING)
5 ;;; to affect functions in which instrumentation should be inserted.
6 ;;; After that, the simplest way to run the function under test is to pass
7 ;;; it to APROF-RUN. Alternatively, collection can be toggled with APROF-START
8 ;;; and APROF-STOP, but some setup is needed (as done by APROF-RUN).
10 ;;; Note: This is x86-64 only at the moment.
12 ;;; Profiling works via compile-time instrumentation.
13 ;;; Each point in code emitted by ALLOCATION adds a few instructions
14 ;;; prior to the actual inline allocation sequence. The extra instructions
15 ;;; are normally disabled (there is a jump around them). When the profiler
16 ;;; is switched on, the instructions are modified to atomically increment
17 ;;; a counter (or two). If the allocation is a compile-time fixed size,
18 ;;; only one counter is needed. If the size is variable, then two counters
19 ;;; are used: hits and total bytes.
20 ;;; The array of counters is prepared by the profiler when first started.
21 ;;; The index into the array for any particular counter is determined
22 ;;; the first time that counter is hit. Subsequent hits are cheap.
24 ;;; On x86-64, instrumented code initially looks like this:
26 ;;; 9E98: 4D8B5D18 MOV R11, [R13+24] ; thread.profile-data
27 ;;; 9E9C: 4D85DB TEST R11, R11
28 ;;; 9E9F: EB17 JMP L21
29 ;;; 9EA1: 0F1F8000000000 NOP
30 ;;; 9EA8: E8D366FDFF CALL #x21B00580 ; ENABLE-SIZED-ALLOC-COUNTER
32 ;;; 9EB0: 4885C9 TEST RCX, RCX
33 ;;; 9EB3: 0F1F440000 NOP
34 ;;; 9EB8: L21: 4D8B5D20 MOV R11, [R13+32] ; thread.alloc-region
36 ;;; The first TEST instruction's result is ignored.
37 ;;; The second TEST conveys the register holding the size to allocate in bytes.
39 ;;; After enabling profiling, the first JMP becomes a JEQ so that
40 ;;; if the data buffer exists, the CALL is performed.
41 ;;; If this code is actually reached, and the call performed,
42 ;;; then the call instruction is modified to (something like):
44 ;;; 9E98: 4D8B5D18 MOV R11, [R13+24] ; thread.profile-data
45 ;;; 9E9C: 4D85DB TEST R11, R11
46 ;;; 9E9F: 7417 JEQ L21
47 ;;; 9EA1: 0F1F8000000000 NOP
48 ;;; 9EA8: F049FF83C8020000 LOCK INC QWORD PTR [R11+712]
49 ;;; 9EB0: F049018BD0020000 LOCK ADD [R11+720], RCX
50 ;;; 9EB8: L21: 4D8B5D20 MOV R11, [R13+32] ; thread.alloc-region
52 ;;; All of the modifications to running code can be made safely
53 ;;; even if multiple threads are in that code at the same time, because:
55 ;;; * changing a JMP to a JNE requires writing only 1 byte.
56 ;;; Instruction fetch will either see the new byte or the old,
57 ;;; and it doesn't matter which.
59 ;;; * changing CALL to LOCK INC writes one naturally aligned octword.
60 ;;; Moreover, RIP gets decremented so that the next fetch won't read
61 ;;; the NOP or only part of the bytes of the LOCK INC.
63 ;;; See the example run at the bottom of this file.
65 (defpackage #:sb-aprof
66 (:use
#:cl
#:sb-ext
#:sb-alien
#:sb-sys
#:sb-int
#:sb-kernel
)
67 (:export
#:aprof-run
#:aprof-show
#:aprof-reset
68 #:aprof-start
#:aprof-stop
#:patch-all-code
)
69 (:import-from
#:sb-di
#:valid-tagged-pointer-p
)
70 (:import-from
#:sb-vm
#:thread-reg
)
71 (:import-from
#:sb-x86-64-asm
72 #:register-p
#:get-gpr
#:reg
#:reg-num
73 #:machine-ea
#:machine-ea-p
74 #:machine-ea-disp
#:machine-ea-base
#:machine-ea-index
77 (in-package #:sb-aprof
)
78 (setf (system-package-p *package
*) t
)
80 (defstruct (alloc (:constructor make-alloc
(bytes count type pc
)))
82 (declaim (freeze-type alloc
))
84 (defglobal *allocation-profile-metadata
* nil
)
86 (define-alien-variable alloc-profile-buffer system-area-pointer
)
88 (let ((buffer alloc-profile-buffer
))
89 (unless (= (sap-int buffer
) 0)
90 (alien-funcall (extern-alien "memset" (function void system-area-pointer int size-t
))
93 (* (/ (length *allocation-profile-metadata
*) 2)
94 sb-vm
:n-word-bytes
)))))
96 (defun patch-code (code locs enable
&aux
(n 0) (n-patched 0))
97 (let ((enable-counted (sb-fasl:get-asm-routine
'sb-vm
::enable-alloc-counter
))
98 (enable-sized (sb-fasl:get-asm-routine
'sb-vm
::enable-sized-alloc-counter
))
99 (enable-counted-indirect
100 (sb-fasl:get-asm-routine
'sb-vm
::enable-alloc-counter t
))
101 (enable-sized-indirect
102 (sb-fasl:get-asm-routine
'sb-vm
::enable-sized-alloc-counter t
))
103 (stack (make-array 1 :element-type
'sb-vm
:word
))
104 (insts (code-instructions code
)))
105 (declare (dynamic-extent stack
))
106 (with-alien ((allocation-tracker-counted (function void system-area-pointer
) :extern
)
107 (allocation-tracker-sized (function void system-area-pointer
) :extern
))
108 (do-packed-varints (loc locs
)
110 (let ((byte (sap-ref-8 insts loc
)))
111 (when (eql byte
#xEB
)
112 (setf (sap-ref-8 insts loc
) #x74
) ; JEQ
114 (let* ((next-pc (+ loc
2))
115 (aligned-next-pc (align-up next-pc
8))
116 (opcode (sap-ref-8 insts aligned-next-pc
)))
119 (let* ((rel32 (signed-sap-ref-32 insts
(1+ aligned-next-pc
)))
120 (return-pc (sap+ insts
(+ aligned-next-pc
5)))
121 (target (sap+ return-pc rel32
)))
122 ;; The C rountine looks at the return address to see where
123 ;; to patch in the new instructions, and it gets the return address
124 ;; from the stack, the pointer to which is supplied as the arg.
125 ;; So STACK is the simulated stack containing the return PC.
126 (setf (aref stack
0) (sap-int return-pc
))
127 (cond ((= (sap-int target
) enable-counted
)
128 (alien-funcall allocation-tracker-counted
(vector-sap stack
)))
129 ((= (sap-int target
) enable-sized
)
130 (alien-funcall allocation-tracker-sized
(vector-sap stack
)))
131 ;; Dynamic-space code can't encode call32 to an asm routine. Instead
132 ;; might see CALL to a JMP within the code blob so that we don't
133 ;; produce one absolute fixup per allocation site.
134 ((and (sap>= target insts
)
135 (sap< target
(sap+ insts
(%code-text-size code
)))
136 (= (sap-ref-8 target
0) #xFF
)
137 (= (sap-ref-8 target
1) #x24
)
138 (= (sap-ref-8 target
2) #x25
))
139 (let ((ea (sap-ref-32 target
3)))
140 (cond ((= ea enable-counted-indirect
)
141 (alien-funcall allocation-tracker-counted
(vector-sap stack
)))
142 ((= ea enable-sized-indirect
)
143 (alien-funcall allocation-tracker-sized
(vector-sap stack
)))
145 (error "Unrecognized CALL [EA] at patch site in ~A @ ~X"
148 (error "Unrecognized CALL at patch site in ~A @ ~X"
151 (error "Unrecognized opcode at patch site in ~A @ ~X"
154 (values n n-patched
)))
156 ;; Fixed-size allocations consume 1 entry (hit count).
157 ;; Variable-size consume 2 (hit count and total bytes).
158 ;; Counters 0 and 1 are reserved for variable-size allocations
159 ;; (hit count and total size) that overflow the maximum counter index.
160 ;; Counter 2 is reserved for fixed-size allocations.
161 (define-load-time-global *n-profile-sites
* 3)
163 (defun aprof-start ()
164 (with-alien ((alloc-profile-data unsigned
:extern
))
165 (when (zerop alloc-profile-data
) ; raw data buffer not made yet
166 (let ((v *allocation-profile-metadata
*))
167 ;; Lisp metadata may or may not exist depending on whether you saved a core image
168 ;; with the metadata baked in.
170 (setq v
(make-array 300000)
171 *allocation-profile-metadata
* v
))
172 (with-pinned-objects (v)
173 (setf alloc-profile-data
(get-lisp-obj-address v
))))))
174 (alien-funcall (extern-alien "allocation_profiler_start" (function void
))))
177 (alien-funcall (extern-alien "allocation_profiler_stop" (function void
))))
179 ;;; Passing ENABLE presumes that you want to later use the profiler
180 ;;; without relying on self-modifying code. Everything can be patched now
181 ;;; and then the image dumped. This plays more nicely with some sandboxed
182 ;;; environments that forbid executable text segments, which is
183 ;;; predominantly a concern for ELFinated cores.
184 (defun patch-all-code (&optional enable
)
186 ;; Somewhat un-obviously, we have to "start" the profiler so that the C
187 ;; support figures out how many alloc site indices it can utilize
188 ;; based on the size of the profile data vector passed from lisp.
190 ;; Then stop, because we don't actually want to profile anything now.
192 (let ((total-n-patch-points 0)
194 (ht sb-c
::*allocation-patch-points
*))
195 (dohash ((code locs
) ht
)
197 (multiple-value-bind (n-patch-points n-patched
)
198 ;; just being pedantic about pinning here for documentation
199 (with-pinned-objects (code)
200 (patch-code code locs enable
))
201 (incf total-n-patch-points n-patch-points
)
202 (incf total-n-patched n-patched
)))
203 (values total-n-patch-points total-n-patched
)))
205 (defglobal *tag-to-type
*
208 (cond ((sb-vm::specialized-array-element-type-properties-p x
)
209 (let ((et (sb-vm:saetp-specifier x
)))
210 (sb-kernel:type-specifier
211 (sb-kernel:specifier-type
`(simple-array ,et
1)))))
213 (sb-vm::room-info-name x
))))
216 (defun layout-name (ptr)
217 (if (eql (valid-tagged-pointer-p (int-sap ptr
)) 0)
219 (layout-classoid-name (make-lisp-obj ptr
))))
221 ;;; These EAs are s-expressions, not instances of EA or MACHINE-EA.
223 (defconstant-eqx p-a-flag
`(ea ,(ash sb-vm
::thread-pseudo-atomic-bits-slot sb-vm
:word-shift
)
224 ,(get-gpr :qword sb-vm
::thread-reg
))
227 ;;; Templates to try in order. The one for unknown headered objects should be last
228 ;;; so that we try to match a store to the header word if possible.
229 (defglobal *allocation-templates
* nil
)
230 ;;; Running sb-aprof with #+sb-show is not an important concern,
231 ;;; and I don't care to fix it. It gets an error here:
232 ;;; "don't know how to dump R13 (default MAKE-LOAD-FORM method called)."
234 (setq *allocation-templates
*
237 (cmp :qword ?end
:tlab-limit
)
239 (mov :qword
:tlab-freeptr ?end
)
240 (:or
(add ?end ?bias
) (dec ?end
))
241 (mov ?_
(ea ?_ ?end
) ?header
))
245 (cmp :qword ?end
:tlab-limit
)
247 (mov :qword
:tlab-freeptr ?end
)
249 (mov ?_
(ea ?_ ?end
) ?header
)
250 (mov ?_
(ea ?_ ?end
) ?vector-len
))
253 ;; after the xadd, SIZE holds the original value of free-ptr
254 ;; and free-ptr points to the end of the putative data block.
256 (cmp :qword ?free
:tlab-limit
)
258 (mov :qword
:tlab-freeptr ?free
)
259 ;; Could have one or two stores prior to ORing in a lowtag.
260 (:optional
(mov ?_
(ea 0 ?size
) ?header
))
261 (:optional
(mov ?_
(ea 8 ?size
) ?vector-len
))
262 (:or
(or ?size ?lowtag
)
263 (lea :qword ?result
(ea ?lowtag ?size
))))
265 (any (:or
(lea :qword ?end
(ea ?nbytes ?free ?nbytes-var
))
266 ;; LEA with scale=1 can have base and index swapped
267 (lea :qword ?end
(ea 0 ?nbytes-var ?free
))
268 (add ?end ?free
)) ; ?end originally holds the size in bytes
269 (cmp :qword ?end
:tlab-limit
)
271 (mov :qword
:tlab-freeptr ?end
)
272 (mov ?_
(ea 0 ?free
) ?header
)
273 (:optional
(mov ?_
(ea ?_ ?free
) ?vector-len
))
274 (:or
(or ?free ?lowtag
)
275 (lea :qword ?result
(ea ?lowtag ?free
))))
277 ;; LISTIFY-REST-ARG computes a tagged pointer to the _last_ cons in the memory block,
278 ;; not the first cons.
279 (list (lea :qword ?end
(ea 0 ?nbytes ?free
))
280 (cmp :qword ?end
:tlab-limit
)
282 (mov :qword
:tlab-freeptr ?end
)
283 (lea :qword ?free
(ea ,(- sb-vm
:list-pointer-lowtag
284 (* sb-vm
:cons-size sb-vm
:n-word-bytes
))
288 (acons (lea :qword ?end
(ea 32 ?free
))
289 (cmp :qword ?end
:tlab-limit
)
291 (mov :qword
:tlab-freeptr ?end
)
292 (:repeat
(mov . ignore
))
293 (lea :qword ?result
(ea #.
(+ 16 sb-vm
:list-pointer-lowtag
) ?free
)))
295 ;; either non-headered object (cons) or unknown header or unknown nbytes
296 (unknown-header (:or
(lea :qword ?end
(ea ?nbytes ?free ?nbytes-var
))
297 (lea :qword ?end
(ea 0 ?nbytes-var ?free
))
299 (cmp :qword ?end
:tlab-limit
)
301 (mov :qword
:tlab-freeptr ?end
)
302 (:repeat
(:or
(mov . ignore
) (lea . ignore
)))
303 (:or
(or ?free ?lowtag
)
304 (lea :qword ?result
(ea ?lowtag ?free
))))))
306 (defglobal *allocation-templates-large
* nil
)
307 (setq *allocation-templates-large
*
308 `((array (push ?nbytes
)
311 (mov ?_
(ea 0 ?result
) ?header
)
312 (mov ?_
(ea ?_ ?result
) ?vector-len
)
313 (or ?result ?lowtag
))
314 ;; not really "large" but same as preceding
315 (funinstance (push ?nbytes
)
318 (mov ?_
(ea 0 ?result
) ?header
)
319 (or ?result ?lowtag
))
323 (or ?result ?lowtag
))))
325 (defun iterator-begin (iterator pc code
)
326 (let ((segment (sb-disassem:make-code-segment
328 (sb-sys:sap-
(sb-sys:int-sap pc
) (sb-kernel:code-instructions code
))
330 (dstate (cddr iterator
)))
331 (setf (sb-disassem:dstate-segment dstate
) segment
332 (sb-disassem:dstate-segment-sap dstate
) (funcall (sb-disassem:seg-sap-maker segment
))
333 (sb-disassem:dstate-cur-offs dstate
) 0)))
335 (defun get-instruction (iterator)
336 (destructuring-bind (pos vector . dstate
) iterator
337 (if (< pos
(length vector
))
339 (let ((inst (sb-disassem:disassemble-instruction dstate
)))
340 ;; FIXME: this drops any LOCK prefix, but that seems to be ok
341 (do ((tail (cdr inst
) (cdr tail
)))
343 ;; This takes an instruction expressed thusly:
344 ;; (MOV (#S(MACHINE-EA :DISP n :BASE n) . :QWORD) RDX)
345 ;; and turns it into:
346 ;; (MOV :QWORD #S(MACHINE-EA :DISP n :BASE n) RDX)
347 (when (typep (car tail
) '(cons machine-ea
))
348 (let ((ea (caar tail
)))
349 (setf inst
(list* (car inst
) (cdar tail
) (cdr inst
))) ; insert the :size)
350 (when (eq (machine-ea-base ea
) sb-vm
::thread-reg
)
351 ;; Figure out if we're looking at an allocation buffer
352 (let ((disp (ash (machine-ea-disp ea
) (- sb-vm
:word-shift
))))
354 ((#.sb-vm
::thread-sys-mixed-tlab-slot
355 #.sb-vm
::thread-sys-cons-tlab-slot
356 #.sb-vm
::thread-mixed-tlab-slot
357 #.sb-vm
::thread-cons-tlab-slot
) :tlab-freeptr
)
358 ((#.
(1+ sb-vm
::thread-sys-mixed-tlab-slot
)
359 #.
(1+ sb-vm
::thread-sys-cons-tlab-slot
)
360 #.
(1+ sb-vm
::thread-mixed-tlab-slot
)
361 #.
(1+ sb-vm
::thread-cons-tlab-slot
)) :tlab-limit
))
363 (setf (car tail
) ea
)) ; change the EA
364 ;; There can be at most one EA per instruction, so we're done
366 (vector-push-extend inst vector
)
369 (defparameter *debug-deduce-type
* nil
)
370 (eval-when (:compile-toplevel
:execute
)
371 (defmacro note
(&rest args
)
372 `(when *debug-deduce-type
*
373 (let ((*print-pretty
* nil
))
374 (format t
,@args
)))))
376 ;;; If INPUT matches PATTERN then return a potentially amended
377 ;;; list of BINDINGS, otherwise return :FAIL.
378 (defun %matchp
(input pattern bindings
)
379 (when (eq (car pattern
) :or
)
380 ;; Match the first thing possible. There is no backtracking.
381 (dolist (choice (cdr pattern
) (return-from %matchp
:fail
))
382 (let ((new-bindings (%matchp input choice bindings
)))
383 (unless (eq new-bindings
:fail
)
384 (return-from %matchp new-bindings
)))))
385 (let ((inst (get-instruction input
)))
386 (note "Pat=~S Inst=~S bindings=~S~%" pattern inst bindings
)
387 (when (eq (car pattern
) :if
)
388 (unless (funcall (cadr pattern
) inst bindings
)
389 (return-from %matchp
:fail
))
390 (setq pattern
(caddr pattern
)))
391 (unless (string= (car inst
) (car pattern
))
392 (return-from %matchp
:fail
))
395 (when (eq pattern
'ignore
) ; don't parse operands
396 (return-from %matchp bindings
))
397 (labels ((match-atom (pattern input
&optional ea-reg-p
)
398 (note " match-atom ~s ~s ~s ~s~%" pattern input ea-reg-p bindings
)
399 (when (and (integerp input
) ea-reg-p
)
400 (setq input
(get-gpr :qword input
)))
401 (cond ((eq pattern
'?_
) t
) ; match and ignore anything
402 ((and (symbolp pattern
) (char= (char (string pattern
) 0) #\?))
403 ;; free variable binds to input, otherwise binding must match
404 (let* ((cell (assq pattern bindings
))
405 (binding (cdr cell
)))
407 (if (and (typep input
'reg
) (typep binding
'reg
))
408 ;; ignore the operand size I guess?
409 (= (reg-num input
) (reg-num binding
))
410 (eql input binding
)))
411 ((ok-binding pattern input
)
412 (note " binding ~s~%" pattern
)
413 (push (cons pattern input
) bindings
)))))
414 (t (eql pattern input
))))
415 (ok-binding (pattern input
)
418 (memq input
`(,sb-vm
:instance-pointer-lowtag
,sb-vm
:list-pointer-lowtag
419 ,sb-vm
:fun-pointer-lowtag
,sb-vm
:other-pointer-lowtag
)))
422 (let* ((pattern-operand (pop pattern
))
423 (inst-operand (pop inst
))
425 (etypecase pattern-operand
427 (match-atom pattern-operand inst-operand
))
429 (and (typep inst-operand
'machine-ea
)
430 (destructuring-bind (disp base
&optional index
) (cdr pattern-operand
)
431 (and (match-atom disp
(or (machine-ea-disp inst-operand
) 0))
432 (match-atom base
(machine-ea-base inst-operand
) t
)
433 (match-atom index
(machine-ea-index inst-operand
) t
)))))
434 ((or (integer 1 15) ; looks like a widetag
436 (eql pattern-operand inst-operand
)))))
438 (return-from %matchp
:fail
)))
440 (return (if (null inst
) bindings
)))))))
442 (defun matchp (input template bindings
&aux
(start (car input
)))
443 (macrolet ((inst-matchp (input pattern
)
444 `(let ((new-bindings (%matchp
,input
,pattern bindings
)))
445 (note "bindings <- ~s~%" new-bindings
)
446 (cond ((eq new-bindings
:fail
) nil
)
447 (t (setq bindings new-bindings
)
448 (incf (car input
))))))
450 `(progn (setf (car input
) start
) ; rewind the iterator
451 (return-from matchp
:fail
))))
452 (loop (when (endp template
) (return bindings
))
453 (let ((pattern (pop template
)))
456 ;; :OPTIONAL is greedy, preferring to match if it can,
457 ;; but if the rest of the template fails, we'll backtrack
458 ;; and skip this pattern.
459 (when (inst-matchp input
(cadr pattern
))
460 (let ((bindings (matchp input template bindings
)))
461 (cond ((eq bindings
:fail
)
462 (setf (car input
) start
)) ; don't match
464 (return bindings
))))))
466 ;; :REPEAT matches zero or more instructions, as few as possible.
467 (let ((next-pattern (pop template
)))
468 (loop (when (inst-matchp input next-pattern
) (return))
469 (unless (inst-matchp input
(cadr pattern
))
472 (unless (inst-matchp input pattern
)
475 (defun deduce-layout (iterator bindings
)
476 (unless (assq '?result bindings
)
477 (push `(?result .
,(cdr (assq '?free bindings
))) bindings
))
478 (when (eql (cdr (assq '?lowtag bindings
)) sb-vm
:instance-pointer-lowtag
)
479 (destructuring-bind (pos vector . dstate
) iterator
480 (let ((inst (aref vector
(1- pos
))))
481 (aver (eq (car inst
) 'or
))
482 (let* ((iterator (list* (- pos
2) vector dstate
))
483 (bindings (matchp iterator
484 (load-time-value `((mov :dword
(ea 4 ?result
) ?layout
)) t
)
486 (if (eq bindings
:fail
)
488 (layout-name (cdr (assq '?layout bindings
)))))))))
490 (defun deduce-fun-subtype (iterator bindings
)
491 (declare (ignorable iterator bindings
))
492 #-immobile-space
'function
497 `((mov ?scratch ?header
)
499 (ea ,(ash sb-vm
::thread-function-layout-slot sb-vm
:word-shift
)
500 ,(get-gpr :qword sb-vm
::thread-reg
)))
501 (mov :qword
(ea ,(- sb-vm
:fun-pointer-lowtag
) ?result
) ?scratch
))
504 (header (and (listp bindings
) (cdr (assoc '?header bindings
)))))
505 (if (and (integerp header
) (eq (logand header
#xFF
) sb-vm
:closure-widetag
))
509 (defun deduce-type (pc dstate code
&optional
(*debug-deduce-type
* *debug-deduce-type
*)
511 (dx-let ((iterator (list* 0 (make-array 16 :fill-pointer
0) dstate
)))
512 (iterator-begin iterator pc code
)
514 ;; Expect an increment of the allocation point hit counter
515 (let* ((inst (get-instruction iterator
))
516 (ea (third inst
))) ; (INC :qword EA)
517 (when (and (eq (car inst
) 'inc
) (machine-ea-base ea
) (null (machine-ea-index ea
)))
518 (incf (car iterator
))
519 (let ((profiler-base (machine-ea-base ea
))
520 (profiler-index (machine-ea-disp ea
)))
521 ;; Optional: the total number of bytes at the allocation point
522 (let* ((inst (get-instruction iterator
))
524 (when (and (eq (car inst
) 'add
)
526 (eql (machine-ea-base ea
) profiler-base
)
527 (null (machine-ea-index ea
))
528 (eql (machine-ea-disp ea
) (+ profiler-index sb-vm
:n-word-bytes
)))
529 (incf (car iterator
)))))))
531 ;; Expect a store to the pseudo-atomic flag
533 (when (eq (matchp iterator
534 (load-time-value `((mov :qword
,p-a-flag
,(get-gpr :qword thread-reg
))) t
)
536 (return-from deduce-type
(values nil nil
)))
540 (matchp iterator
`((mov :qword ?free
:tlab-freeptr
)) nil
))
543 (list (find template-name
*allocation-templates
* :key
'car
)))
546 *allocation-templates-large
*)
547 (t *allocation-templates
*))))
548 (dolist (allocator templates
549 (error "Unrecognized allocator at ~x in ~s:~{~%~S~}"
550 pc code
(coerce (second iterator
) 'list
)))
551 (note "Trying ~a~%" (car allocator
))
552 (let ((new-bindings (matchp iterator
(cdr allocator
) bindings
)))
553 (unless (eq new-bindings
:fail
)
554 (setq bindings new-bindings type
(car allocator
))
556 (if (eq bindings
:fail
)
558 (let ((nbytes (cdr (assoc '?nbytes bindings
)))
559 (header (cdr (assoc '?header bindings
)))
560 (lowtag (cdr (assoc '?lowtag bindings
))))
561 ;; matchp converts NIL in a machine-ea to 0. The disassembler uses
562 ;; NIL to signify that there was no displacement, which makes sense
563 ;; when register indirect mode is used without a SIB byte.
566 (cond ((and (member type
'(fixed+header var-array var-xadd any
))
567 (typep header
'(or sb-vm
:word sb-vm
:signed-word
)))
568 (setq type
(aref *tag-to-type
* (logand header
#xFF
)))
569 (when (register-p nbytes
)
571 (when (eq type
'instance
)
572 (setq type
(deduce-layout iterator bindings
))))
573 ((eq type
'list
) ; listify-rest-arg
574 (unless (integerp nbytes
)
577 (setq type
'list nbytes
(* 2 sb-vm
:cons-size sb-vm
:n-word-bytes
)))
578 ((member type
'(any unknown-header
))
579 (setq type
(case lowtag
580 (#.sb-vm
:list-pointer-lowtag
'list
)
581 (#.sb-vm
:instance-pointer-lowtag
'instance
)
582 (#.sb-vm
:fun-pointer-lowtag
583 (deduce-fun-subtype iterator bindings
))
585 (values type nbytes
))))))
587 ;;; Return a name for PC-OFFSET in CODE. PC-OFFSET is relative to
588 ;;; CODE-INSTRUCTIONS.
589 (defun pc-offset-to-fun-name (pc-offset code
)
590 (if (eq sb-vm
::*assembler-routines
* code
)
592 (maphash (lambda (k v
) ; FIXME: OAOO violation, at least twice over
593 (when (<= (car v
) pc-offset
(cadr v
))
595 (%code-debug-info code
)))
596 (sb-c::compiled-debug-fun-name
597 (sb-di::compiled-debug-fun-compiler-debug-fun
598 (sb-di::debug-fun-from-pc code pc-offset
)))))
600 (defun aprof-collect (stream)
601 (sb-disassem:get-inst-space
) ; for effect
602 (let* ((metadata *allocation-profile-metadata
*)
603 (n-hit *n-profile-sites
*)
604 (metadata-len (/ (length metadata
) 2))
605 (n-counters (min metadata-len n-hit
))
606 (sap (extern-alien "alloc_profile_buffer" system-area-pointer
))
608 (dstate (sb-disassem:make-dstate nil
))
609 (collection (make-hash-table :test
'equal
)))
611 (format stream
"~&~d (of ~d max) profile entries consumed~2%"
614 (when (>= index n-counters
)
616 (let ((count (sap-ref-word sap
(* index
8))))
617 (multiple-value-bind (code pc-offset total-bytes
)
618 (if (null (aref metadata
(ash index
1))) ; sized alloc
619 (values (aref metadata
(+ (ash index
1) 2))
620 (aref metadata
(+ (ash index
1) 3))
621 (sap-ref-word sap
(* (1+ index
) 8)))
622 (values (aref metadata
(+ (ash index
1) 0))
623 (aref metadata
(+ (ash index
1) 1))))
624 (incf index
(if total-bytes
2 1)) ; <count,bytes> or just a count
625 (unless (eql count
0)
626 (with-pinned-objects (code)
627 (let ((pc (+ (get-lisp-obj-address code
)
628 (- sb-vm
:other-pointer-lowtag
)
630 (name (pc-offset-to-fun-name
631 ;; Relativize to CODE-INSTRUCTIONS, not the base address
632 (- pc-offset
(ash (code-header-words code
) sb-vm
:word-shift
))
634 (multiple-value-bind (type size
) (deduce-type pc dstate code
)
635 (cond (size ; fixed-size allocation
636 (aver (not total-bytes
))
637 (push (make-alloc (* count size
) count type pc
)
638 (gethash name collection
)))
641 ;; The only allocator that determines TYPE at run-time
642 ;; is ALLOCATE-VECTOR-ON-HEAP. VAR-ALLOC uses a codegen
643 ;; arg, not a TN, for the widetag, but it looks like
644 ;; the type is dynamic because it kind of is: the
645 ;; header words is computed into a register and written
646 ;; in a single store.
647 (push (make-alloc total-bytes count type pc
)
648 (gethash name collection
)))))))))))))
650 (defun collapse-by-type (data &aux new
)
651 (dolist (datum data new
)
652 (let ((found (find (alloc-type datum
) new
:key
#'alloc-type
655 (incf (alloc-bytes found
) (alloc-bytes datum
))
656 (incf (alloc-count found
) (alloc-count datum
)))
658 (push datum new
))))))
660 ;; DETAIL NIL shows just function name and percent space consumption
661 ;; DETAIL T shows function, bytes, percent,
662 ;; and unless there is only one detail line, the detail lines
664 (defun aprof-show (&key
(top-n 20) (detail t
) (collapse t
) (stream *standard-output
*))
667 (let* ((collection (%hash-table-alist
(aprof-collect stream
)))
671 (reduce #'+ (cdr x
) :key
#'alloc-bytes
)
674 (sorted (sort (copy-list summary
) #'> :key
#'second
))
675 (total-bytes (reduce #'+ sorted
:key
#'second
))
680 (when (eq stream nil
)
681 (setq stream sb-impl
::*null-broadcast-stream
*)) ; lazy's person's approach
683 (format stream
"~& % Sum % Bytes Allocations Function~%")
684 (format stream
"~& ------- ------- ----------- ----------- --------~%"))
686 (format stream
"~& % Bytes Count ~:[~; PC ~]Function~%"
688 (format stream
"~& ------- ----------- --------- ~:[~;---------- ~]--------~%"
691 ;; In detailed view, each function takes up either one line or
692 ;; more than one line. In the interest of saving space, newlines are
693 ;; omitted between consecutive functions each of whose detail consists
694 ;; of a single line. But to avoid ambiguity, esure that there is always
695 ;; a blank line before and after each function having multiline detail.
696 (let ((emitted-newline t
))
698 (destructuring-bind (name bytes . data
) x
701 (setq data
(collapse-by-type data
)))
702 (setq data
(sort data
#'> :key
#'alloc-bytes
)))
703 (assert (eq bytes
(reduce #'+ data
:key
#'alloc-bytes
)))
704 (when (and detail
(cdr data
) (not emitted-newline
))
706 (incf sum-pct
(float (/ bytes total-bytes
)))
707 ;; Show summary for the function
709 (format stream
" ~5,1,2f ~5,1,2f ~12d~15d ~s~%"
710 (/ bytes total-bytes
)
713 (reduce #'+ data
:key
#'alloc-count
)
716 (format stream
" ~5,1,2f ~12d ~:[~10@t~;~:*~10d~]~@[~14@a~] ~s~@[ - ~s~]~%"
717 (/ bytes total-bytes
)
719 (if (cdr data
) nil
(alloc-count (car data
)))
722 (t (write-to-string (alloc-pc (car data
)) :base
16)))
724 (if (cdr data
) nil
(alloc-type (car data
)))
726 (when (and detail
(cdr data
))
728 (format stream
" ~5,1,2f ~12d ~10d~@[~14x~]~@[ ~s~]~%"
729 (/ (alloc-bytes point
) bytes
) ; fraction within function
732 (if collapse nil
(alloc-pc point
))
733 (alloc-type point
))))
734 (incf sum-bytes bytes
)
736 (setq emitted-newline
(not (null (cdr data
)))))
739 (if (and (neq top-n
:all
) (>= i top-n
)) (return))))
740 ; (assert (= sum-bytes total-bytes))
742 (format stream
"~19@t===========~%~19@t~11d~%" sum-bytes
))
744 (format stream
" ======= ===========~%~6,1,2f ~12d~%"
748 ;;; Call FUN and return the exact number of bytes it (an all descendant
749 ;;; calls) allocated, provided that they were instrumented for precise
751 ;;; STREAM is where to report, defaulting to *standard-output*.
752 ;;; The convention is that of map-segment-instructions, meaning NIL is a sink.
753 (defun aprof-run (fun &key
(report t
) (stream *standard-output
*) arguments
)
756 (dx-let ((arglist (cons arguments nil
))) ; so no consing in here
757 (when (listp arguments
)
758 (setq arglist
(car arglist
))) ; was already a list
761 (progn (aprof-start) (apply fun arglist
))
764 (setq nbytes
(aprof-show :stream stream
))
765 (when stream
(terpri stream
)))
771 ;;; The default output is a report showing the top 20 allocators
772 ;;; (by function name) with a line of detail for each distinct
773 ;;; type of object allocated within the function.
775 ;;; * (aprof-run (lambda () (compile-file "~/aprof")))
776 ;;; % Bytes Count Function
777 ;;; ------- ----------- --------- --------
778 ;;; 6.6 1980160 INIT-SB-VECTORS
779 ;;; 79.1 1565824 48880 SIMPLE-BIT-VECTOR
780 ;;; 20.9 414336 1872 SIMPLE-VECTOR
782 ;;; 4.6 1368960 MAKE-TN
783 ;;; 83.3 1140800 7130 TN
784 ;;; 16.7 228160 7130 SIMPLE-BIT-VECTOR
786 ;;; The report can be made more detailed by not combining lines
787 ;;; for the same object type within a function:
789 ;;; * (aprof-show :detail t :collapse nil)
791 ;;; % Bytes Count PC Function
792 ;;; ------- ----------- --------- ---------- --------
793 ;;; 6.6 1980160 INIT-SB-VECTORS
794 ;;; 76.0 1504256 47008 21D07E88 SIMPLE-BIT-VECTOR
795 ;;; 20.9 414336 1872 21D07D28 SIMPLE-VECTOR
796 ;;; 3.1 61568 1872 21D07F30 SIMPLE-BIT-VECTOR
798 ;;; 4.6 1368960 MAKE-TN
799 ;;; 83.3 1140800 7130 21C98638 TN
800 ;;; 16.7 228160 7130 21C985D8 SIMPLE-BIT-VECTOR
802 ;;; 4.0 1204160 MAKE-HASH-TABLE
803 ;;; 39.3 473472 1644 21B2A008 SIMPLE-VECTOR
804 ;;; 37.1 447168 1644 21B29F50 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
805 ;;; 21.8 263040 1644 21B29FA8 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
806 ;;; 1.7 20480 128 21B2A098 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
808 ;;; Or less detailed by combining all lines within a function:
810 ;;; * (sb-aprof::aprof-show :detail nil :top-n 1000) ; or anything
811 ;;; % Sum % Bytes Allocations Function
812 ;;; ------- ------- ----------- ----------- --------
813 ;;; 6.6 6.6 1980160 50752 INIT-SB-VECTORS
814 ;;; 4.6 11.1 1368960 14260 MAKE-TN
815 ;;; 4.0 15.1 1204160 5060 MAKE-HASH-TABLE
816 ;;; 3.8 19.0 1154384 28727 CONSTRAIN-REF-TYPE
817 ;;; 3.8 22.8 1152160 13084 COPY-CONSET
818 ;;; 3.6 26.5 1094240 13678 MAKE-TN-REF
819 ;;; ... many more lines ...
820 ;;; 00.0 100.0 16 1 %ENTER-NEW-NICKNAMES
821 ;;; 00.0 100.0 16 1 UNIX-LSTAT