%other-pointer-widetag derive-type: derive for simple-array.
[sbcl.git] / src / code / aprof.lisp
blob8adca72c8ae402524a293ba0dde8cca04dc56d01
1 ;;; Allocation profiler
3 ;;; Quick start: To enable this code, you must DECLAIM or DECLARE
4 ;;; (OPTIMIZE SB-C::INSTRUMENT-CONSING)
5 ;;; to affect functions in which instrumentation should be inserted.
6 ;;; After that, the simplest way to run the function under test is to pass
7 ;;; it to APROF-RUN. Alternatively, collection can be toggled with APROF-START
8 ;;; and APROF-STOP, but some setup is needed (as done by APROF-RUN).
9 ;;;
10 ;;; Note: This is x86-64 only at the moment.
12 ;;; Profiling works via compile-time instrumentation.
13 ;;; Each point in code emitted by ALLOCATION adds a few instructions
14 ;;; prior to the actual inline allocation sequence. The extra instructions
15 ;;; are normally disabled (there is a jump around them). When the profiler
16 ;;; is switched on, the instructions are modified to atomically increment
17 ;;; a counter (or two). If the allocation is a compile-time fixed size,
18 ;;; only one counter is needed. If the size is variable, then two counters
19 ;;; are used: hits and total bytes.
20 ;;; The array of counters is prepared by the profiler when first started.
21 ;;; The index into the array for any particular counter is determined
22 ;;; the first time that counter is hit. Subsequent hits are cheap.
24 ;;; On x86-64, instrumented code initially looks like this:
26 ;;; 9E98: 4D8B5D18 MOV R11, [R13+24] ; thread.profile-data
27 ;;; 9E9C: 4D85DB TEST R11, R11
28 ;;; 9E9F: EB17 JMP L21
29 ;;; 9EA1: 0F1F8000000000 NOP
30 ;;; 9EA8: E8D366FDFF CALL #x21B00580 ; ENABLE-SIZED-ALLOC-COUNTER
31 ;;; 9EAD: 0F1F00 NOP
32 ;;; 9EB0: 4885C9 TEST RCX, RCX
33 ;;; 9EB3: 0F1F440000 NOP
34 ;;; 9EB8: L21: 4D8B5D20 MOV R11, [R13+32] ; thread.alloc-region
35 ;;;
36 ;;; The first TEST instruction's result is ignored.
37 ;;; The second TEST conveys the register holding the size to allocate in bytes.
39 ;;; After enabling profiling, the first JMP becomes a JEQ so that
40 ;;; if the data buffer exists, the CALL is performed.
41 ;;; If this code is actually reached, and the call performed,
42 ;;; then the call instruction is modified to (something like):
43 ;;;
44 ;;; 9E98: 4D8B5D18 MOV R11, [R13+24] ; thread.profile-data
45 ;;; 9E9C: 4D85DB TEST R11, R11
46 ;;; 9E9F: 7417 JEQ L21
47 ;;; 9EA1: 0F1F8000000000 NOP
48 ;;; 9EA8: F049FF83C8020000 LOCK INC QWORD PTR [R11+712]
49 ;;; 9EB0: F049018BD0020000 LOCK ADD [R11+720], RCX
50 ;;; 9EB8: L21: 4D8B5D20 MOV R11, [R13+32] ; thread.alloc-region
52 ;;; All of the modifications to running code can be made safely
53 ;;; even if multiple threads are in that code at the same time, because:
54 ;;;
55 ;;; * changing a JMP to a JNE requires writing only 1 byte.
56 ;;; Instruction fetch will either see the new byte or the old,
57 ;;; and it doesn't matter which.
58 ;;;
59 ;;; * changing CALL to LOCK INC writes one naturally aligned octword.
60 ;;; Moreover, RIP gets decremented so that the next fetch won't read
61 ;;; the NOP or only part of the bytes of the LOCK INC.
63 ;;; See the example run at the bottom of this file.
65 (defpackage #:sb-aprof
66 (:use #:cl #:sb-ext #:sb-alien #:sb-sys #:sb-int #:sb-kernel)
67 (:export #:aprof-run #:aprof-show #:aprof-reset
68 #:aprof-start #:aprof-stop #:patch-all-code)
69 (:import-from #:sb-di #:valid-tagged-pointer-p)
70 (:import-from #:sb-vm #:thread-reg)
71 (:import-from #:sb-x86-64-asm
72 #:register-p #:get-gpr #:reg #:reg-num
73 #:machine-ea #:machine-ea-p
74 #:machine-ea-disp #:machine-ea-base #:machine-ea-index
75 #:inc #:add #:mov))
77 (in-package #:sb-aprof)
78 (setf (system-package-p *package*) t)
80 (defstruct (alloc (:constructor make-alloc (bytes count type pc)))
81 bytes count type pc)
82 (declaim (freeze-type alloc))
84 (defglobal *allocation-profile-metadata* nil)
86 (define-alien-variable alloc-profile-buffer system-area-pointer)
87 (defun aprof-reset ()
88 (let ((buffer alloc-profile-buffer))
89 (unless (= (sap-int buffer) 0)
90 (alien-funcall (extern-alien "memset" (function void system-area-pointer int size-t))
91 alloc-profile-buffer
93 (* (/ (length *allocation-profile-metadata*) 2)
94 sb-vm:n-word-bytes)))))
96 (defun patch-code (code locs enable &aux (n 0) (n-patched 0))
97 (let ((enable-counted (sb-fasl:get-asm-routine 'sb-vm::enable-alloc-counter))
98 (enable-sized (sb-fasl:get-asm-routine 'sb-vm::enable-sized-alloc-counter))
99 (enable-counted-indirect
100 (sb-fasl:get-asm-routine 'sb-vm::enable-alloc-counter t))
101 (enable-sized-indirect
102 (sb-fasl:get-asm-routine 'sb-vm::enable-sized-alloc-counter t))
103 (stack (make-array 1 :element-type 'sb-vm:word))
104 (insts (code-instructions code)))
105 (declare (dynamic-extent stack))
106 (with-alien ((allocation-tracker-counted (function void system-area-pointer) :extern)
107 (allocation-tracker-sized (function void system-area-pointer) :extern))
108 (do-packed-varints (loc locs)
109 (incf n)
110 (let ((byte (sap-ref-8 insts loc)))
111 (when (eql byte #xEB)
112 (setf (sap-ref-8 insts loc) #x74) ; JEQ
113 (when enable
114 (let* ((next-pc (+ loc 2))
115 (aligned-next-pc (align-up next-pc 8))
116 (opcode (sap-ref-8 insts aligned-next-pc)))
117 (case opcode
118 (#xE8 ; CALL rel32
119 (let* ((rel32 (signed-sap-ref-32 insts (1+ aligned-next-pc)))
120 (return-pc (sap+ insts (+ aligned-next-pc 5)))
121 (target (sap+ return-pc rel32)))
122 ;; The C rountine looks at the return address to see where
123 ;; to patch in the new instructions, and it gets the return address
124 ;; from the stack, the pointer to which is supplied as the arg.
125 ;; So STACK is the simulated stack containing the return PC.
126 (setf (aref stack 0) (sap-int return-pc))
127 (cond ((= (sap-int target) enable-counted)
128 (alien-funcall allocation-tracker-counted (vector-sap stack)))
129 ((= (sap-int target) enable-sized)
130 (alien-funcall allocation-tracker-sized (vector-sap stack)))
131 ;; Dynamic-space code can't encode call32 to an asm routine. Instead
132 ;; might see CALL to a JMP within the code blob so that we don't
133 ;; produce one absolute fixup per allocation site.
134 ((and (sap>= target insts)
135 (sap< target (sap+ insts (%code-text-size code)))
136 (= (sap-ref-8 target 0) #xFF)
137 (= (sap-ref-8 target 1) #x24)
138 (= (sap-ref-8 target 2) #x25))
139 (let ((ea (sap-ref-32 target 3)))
140 (cond ((= ea enable-counted-indirect)
141 (alien-funcall allocation-tracker-counted (vector-sap stack)))
142 ((= ea enable-sized-indirect)
143 (alien-funcall allocation-tracker-sized (vector-sap stack)))
145 (error "Unrecognized CALL [EA] at patch site in ~A @ ~X"
146 code loc)))))
148 (error "Unrecognized CALL at patch site in ~A @ ~X"
149 code loc)))))
151 (error "Unrecognized opcode at patch site in ~A @ ~X"
152 code loc)))))
153 (incf n-patched)))))
154 (values n n-patched)))
156 ;; Fixed-size allocations consume 1 entry (hit count).
157 ;; Variable-size consume 2 (hit count and total bytes).
158 ;; Counters 0 and 1 are reserved for variable-size allocations
159 ;; (hit count and total size) that overflow the maximum counter index.
160 ;; Counter 2 is reserved for fixed-size allocations.
161 (define-load-time-global *n-profile-sites* 3)
163 (defun aprof-start ()
164 (with-alien ((alloc-profile-data unsigned :extern))
165 (when (zerop alloc-profile-data) ; raw data buffer not made yet
166 (let ((v *allocation-profile-metadata*))
167 ;; Lisp metadata may or may not exist depending on whether you saved a core image
168 ;; with the metadata baked in.
169 (unless v
170 (setq v (make-array 300000)
171 *allocation-profile-metadata* v))
172 (with-pinned-objects (v)
173 (setf alloc-profile-data (get-lisp-obj-address v))))))
174 (alien-funcall (extern-alien "allocation_profiler_start" (function void))))
176 (defun aprof-stop ()
177 (alien-funcall (extern-alien "allocation_profiler_stop" (function void))))
179 ;;; Passing ENABLE presumes that you want to later use the profiler
180 ;;; without relying on self-modifying code. Everything can be patched now
181 ;;; and then the image dumped. This plays more nicely with some sandboxed
182 ;;; environments that forbid executable text segments, which is
183 ;;; predominantly a concern for ELFinated cores.
184 (defun patch-all-code (&optional enable)
185 (when enable
186 ;; Somewhat un-obviously, we have to "start" the profiler so that the C
187 ;; support figures out how many alloc site indices it can utilize
188 ;; based on the size of the profile data vector passed from lisp.
189 (aprof-start)
190 ;; Then stop, because we don't actually want to profile anything now.
191 (aprof-stop))
192 (let ((total-n-patch-points 0)
193 (total-n-patched 0)
194 (ht sb-c::*allocation-patch-points*))
195 (dohash ((code locs) ht)
196 (remhash code ht)
197 (multiple-value-bind (n-patch-points n-patched)
198 ;; just being pedantic about pinning here for documentation
199 (with-pinned-objects (code)
200 (patch-code code locs enable))
201 (incf total-n-patch-points n-patch-points)
202 (incf total-n-patched n-patched)))
203 (values total-n-patch-points total-n-patched)))
205 (defglobal *tag-to-type*
206 (map 'vector
207 (lambda (x)
208 (cond ((sb-vm::specialized-array-element-type-properties-p x)
209 (let ((et (sb-vm:saetp-specifier x)))
210 (sb-kernel:type-specifier
211 (sb-kernel:specifier-type `(simple-array ,et 1)))))
213 (sb-vm::room-info-name x))))
214 sb-vm::*room-info*))
216 (defun layout-name (ptr)
217 (if (eql (valid-tagged-pointer-p (int-sap ptr)) 0)
218 'structure
219 (layout-classoid-name (make-lisp-obj ptr))))
221 ;;; These EAs are s-expressions, not instances of EA or MACHINE-EA.
222 #-sb-safepoint
223 (defconstant-eqx p-a-flag `(ea ,(ash sb-vm::thread-pseudo-atomic-bits-slot sb-vm:word-shift)
224 ,(get-gpr :qword sb-vm::thread-reg))
225 #'equal)
227 ;;; Templates to try in order. The one for unknown headered objects should be last
228 ;;; so that we try to match a store to the header word if possible.
229 (defglobal *allocation-templates* nil)
230 ;;; Running sb-aprof with #+sb-show is not an important concern,
231 ;;; and I don't care to fix it. It gets an error here:
232 ;;; "don't know how to dump R13 (default MAKE-LOAD-FORM method called)."
233 #-sb-show
234 (setq *allocation-templates*
235 `((fixed+header
236 (add ?end ?nbytes)
237 (cmp :qword ?end :tlab-limit)
238 (jmp :a ?_)
239 (mov :qword :tlab-freeptr ?end)
240 (:or (add ?end ?bias) (dec ?end))
241 (mov ?_ (ea ?_ ?end) ?header))
243 (var-array
244 (add ?end ?nbytes)
245 (cmp :qword ?end :tlab-limit)
246 (jmp :a ?_)
247 (mov :qword :tlab-freeptr ?end)
248 (sub ?end ?nbytes)
249 (mov ?_ (ea ?_ ?end) ?header)
250 (mov ?_ (ea ?_ ?end) ?vector-len))
252 (var-xadd
253 ;; after the xadd, SIZE holds the original value of free-ptr
254 ;; and free-ptr points to the end of the putative data block.
255 (xadd ?free ?size)
256 (cmp :qword ?free :tlab-limit)
257 (jmp :a ?_)
258 (mov :qword :tlab-freeptr ?free)
259 ;; Could have one or two stores prior to ORing in a lowtag.
260 (:optional (mov ?_ (ea 0 ?size) ?header))
261 (:optional (mov ?_ (ea 8 ?size) ?vector-len))
262 (:or (or ?size ?lowtag)
263 (lea :qword ?result (ea ?lowtag ?size))))
265 (any (:or (lea :qword ?end (ea ?nbytes ?free ?nbytes-var))
266 ;; LEA with scale=1 can have base and index swapped
267 (lea :qword ?end (ea 0 ?nbytes-var ?free))
268 (add ?end ?free)) ; ?end originally holds the size in bytes
269 (cmp :qword ?end :tlab-limit)
270 (jmp :a ?_)
271 (mov :qword :tlab-freeptr ?end)
272 (mov ?_ (ea 0 ?free) ?header)
273 (:optional (mov ?_ (ea ?_ ?free) ?vector-len))
274 (:or (or ?free ?lowtag)
275 (lea :qword ?result (ea ?lowtag ?free))))
277 ;; LISTIFY-REST-ARG computes a tagged pointer to the _last_ cons in the memory block,
278 ;; not the first cons.
279 (list (lea :qword ?end (ea 0 ?nbytes ?free))
280 (cmp :qword ?end :tlab-limit)
281 (jmp :a ?_)
282 (mov :qword :tlab-freeptr ?end)
283 (lea :qword ?free (ea ,(- sb-vm:list-pointer-lowtag
284 (* sb-vm:cons-size sb-vm:n-word-bytes))
285 ?free ?nbytes))
286 (shr ?nbytes 4))
288 (acons (lea :qword ?end (ea 32 ?free))
289 (cmp :qword ?end :tlab-limit)
290 (jmp :a ?_)
291 (mov :qword :tlab-freeptr ?end)
292 (:repeat (mov . ignore))
293 (lea :qword ?result (ea #.(+ 16 sb-vm:list-pointer-lowtag) ?free)))
295 ;; either non-headered object (cons) or unknown header or unknown nbytes
296 (unknown-header (:or (lea :qword ?end (ea ?nbytes ?free ?nbytes-var))
297 (lea :qword ?end (ea 0 ?nbytes-var ?free))
298 (add ?end ?free))
299 (cmp :qword ?end :tlab-limit)
300 (jmp :a ?_)
301 (mov :qword :tlab-freeptr ?end)
302 (:repeat (:or (mov . ignore) (lea . ignore)))
303 (:or (or ?free ?lowtag)
304 (lea :qword ?result (ea ?lowtag ?free))))))
306 (defglobal *allocation-templates-large* nil)
307 (setq *allocation-templates-large*
308 `((array (push ?nbytes)
309 (call . ignore)
310 (pop ?result)
311 (mov ?_ (ea 0 ?result) ?header)
312 (mov ?_ (ea ?_ ?result) ?vector-len)
313 (or ?result ?lowtag))
314 ;; not really "large" but same as preceding
315 (funinstance (push ?nbytes)
316 (call . ignore)
317 (pop ?result)
318 (mov ?_ (ea 0 ?result) ?header)
319 (or ?result ?lowtag))
320 (list (push ?nbytes)
321 (call . ignore)
322 (pop ?result)
323 (or ?result ?lowtag))))
325 (defun iterator-begin (iterator pc code)
326 (let ((segment (sb-disassem:make-code-segment
327 code
328 (sb-sys:sap- (sb-sys:int-sap pc) (sb-kernel:code-instructions code))
329 1000)) ; arbitrary
330 (dstate (cddr iterator)))
331 (setf (sb-disassem:dstate-segment dstate) segment
332 (sb-disassem:dstate-segment-sap dstate) (funcall (sb-disassem:seg-sap-maker segment))
333 (sb-disassem:dstate-cur-offs dstate) 0)))
335 (defun get-instruction (iterator)
336 (destructuring-bind (pos vector . dstate) iterator
337 (if (< pos (length vector))
338 (aref vector pos)
339 (let ((inst (sb-disassem:disassemble-instruction dstate)))
340 ;; FIXME: this drops any LOCK prefix, but that seems to be ok
341 (do ((tail (cdr inst) (cdr tail)))
342 ((null tail))
343 ;; This takes an instruction expressed thusly:
344 ;; (MOV (#S(MACHINE-EA :DISP n :BASE n) . :QWORD) RDX)
345 ;; and turns it into:
346 ;; (MOV :QWORD #S(MACHINE-EA :DISP n :BASE n) RDX)
347 (when (typep (car tail) '(cons machine-ea))
348 (let ((ea (caar tail)))
349 (setf inst (list* (car inst) (cdar tail) (cdr inst))) ; insert the :size)
350 (when (eq (machine-ea-base ea) sb-vm::thread-reg)
351 ;; Figure out if we're looking at an allocation buffer
352 (let ((disp (ash (machine-ea-disp ea) (- sb-vm:word-shift))))
353 (awhen (case disp
354 ((#.sb-vm::thread-sys-mixed-tlab-slot
355 #.sb-vm::thread-sys-cons-tlab-slot
356 #.sb-vm::thread-mixed-tlab-slot
357 #.sb-vm::thread-cons-tlab-slot) :tlab-freeptr)
358 ((#.(1+ sb-vm::thread-sys-mixed-tlab-slot)
359 #.(1+ sb-vm::thread-sys-cons-tlab-slot)
360 #.(1+ sb-vm::thread-mixed-tlab-slot)
361 #.(1+ sb-vm::thread-cons-tlab-slot)) :tlab-limit))
362 (setq ea it))))
363 (setf (car tail) ea)) ; change the EA
364 ;; There can be at most one EA per instruction, so we're done
365 (return)))
366 (vector-push-extend inst vector)
367 inst))))
369 (defparameter *debug-deduce-type* nil)
370 (eval-when (:compile-toplevel :execute)
371 (defmacro note (&rest args)
372 `(when *debug-deduce-type*
373 (let ((*print-pretty* nil))
374 (format t ,@args)))))
376 ;;; If INPUT matches PATTERN then return a potentially amended
377 ;;; list of BINDINGS, otherwise return :FAIL.
378 (defun %matchp (input pattern bindings)
379 (when (eq (car pattern) :or)
380 ;; Match the first thing possible. There is no backtracking.
381 (dolist (choice (cdr pattern) (return-from %matchp :fail))
382 (let ((new-bindings (%matchp input choice bindings)))
383 (unless (eq new-bindings :fail)
384 (return-from %matchp new-bindings)))))
385 (let ((inst (get-instruction input)))
386 (note "Pat=~S Inst=~S bindings=~S~%" pattern inst bindings)
387 (when (eq (car pattern) :if)
388 (unless (funcall (cadr pattern) inst bindings)
389 (return-from %matchp :fail))
390 (setq pattern (caddr pattern)))
391 (unless (string= (car inst) (car pattern))
392 (return-from %matchp :fail))
393 (pop pattern)
394 (pop inst)
395 (when (eq pattern 'ignore) ; don't parse operands
396 (return-from %matchp bindings))
397 (labels ((match-atom (pattern input &optional ea-reg-p)
398 (note " match-atom ~s ~s ~s ~s~%" pattern input ea-reg-p bindings)
399 (when (and (integerp input) ea-reg-p)
400 (setq input (get-gpr :qword input)))
401 (cond ((eq pattern '?_) t) ; match and ignore anything
402 ((and (symbolp pattern) (char= (char (string pattern) 0) #\?))
403 ;; free variable binds to input, otherwise binding must match
404 (let* ((cell (assq pattern bindings))
405 (binding (cdr cell)))
406 (cond (cell
407 (if (and (typep input 'reg) (typep binding 'reg))
408 ;; ignore the operand size I guess?
409 (= (reg-num input) (reg-num binding))
410 (eql input binding)))
411 ((ok-binding pattern input)
412 (note " binding ~s~%" pattern)
413 (push (cons pattern input) bindings)))))
414 (t (eql pattern input))))
415 (ok-binding (pattern input)
416 (case pattern
417 (?lowtag
418 (memq input `(,sb-vm:instance-pointer-lowtag ,sb-vm:list-pointer-lowtag
419 ,sb-vm:fun-pointer-lowtag ,sb-vm:other-pointer-lowtag)))
420 (t t))))
421 (loop
422 (let* ((pattern-operand (pop pattern))
423 (inst-operand (pop inst))
424 (matchp
425 (etypecase pattern-operand
426 (symbol
427 (match-atom pattern-operand inst-operand))
428 ((cons (eql ea))
429 (and (typep inst-operand 'machine-ea)
430 (destructuring-bind (disp base &optional index) (cdr pattern-operand)
431 (and (match-atom disp (or (machine-ea-disp inst-operand) 0))
432 (match-atom base (machine-ea-base inst-operand) t)
433 (match-atom index (machine-ea-index inst-operand) t)))))
434 ((or (integer 1 15) ; looks like a widetag
435 reg)
436 (eql pattern-operand inst-operand)))))
437 (unless matchp
438 (return-from %matchp :fail)))
439 (when (null pattern)
440 (return (if (null inst) bindings)))))))
442 (defun matchp (input template bindings &aux (start (car input)))
443 (macrolet ((inst-matchp (input pattern)
444 `(let ((new-bindings (%matchp ,input ,pattern bindings)))
445 (note "bindings <- ~s~%" new-bindings)
446 (cond ((eq new-bindings :fail) nil)
447 (t (setq bindings new-bindings)
448 (incf (car input))))))
449 (fail ()
450 `(progn (setf (car input) start) ; rewind the iterator
451 (return-from matchp :fail))))
452 (loop (when (endp template) (return bindings))
453 (let ((pattern (pop template)))
454 (case (car pattern)
455 (:optional
456 ;; :OPTIONAL is greedy, preferring to match if it can,
457 ;; but if the rest of the template fails, we'll backtrack
458 ;; and skip this pattern.
459 (when (inst-matchp input (cadr pattern))
460 (let ((bindings (matchp input template bindings)))
461 (cond ((eq bindings :fail)
462 (setf (car input) start)) ; don't match
464 (return bindings))))))
465 (:repeat
466 ;; :REPEAT matches zero or more instructions, as few as possible.
467 (let ((next-pattern (pop template)))
468 (loop (when (inst-matchp input next-pattern) (return))
469 (unless (inst-matchp input (cadr pattern))
470 (fail)))))
472 (unless (inst-matchp input pattern)
473 (fail))))))))
475 (defun deduce-layout (iterator bindings)
476 (unless (assq '?result bindings)
477 (push `(?result . ,(cdr (assq '?free bindings))) bindings))
478 (when (eql (cdr (assq '?lowtag bindings)) sb-vm:instance-pointer-lowtag)
479 (destructuring-bind (pos vector . dstate) iterator
480 (let ((inst (aref vector (1- pos))))
481 (aver (eq (car inst) 'or))
482 (let* ((iterator (list* (- pos 2) vector dstate))
483 (bindings (matchp iterator
484 (load-time-value `((mov :dword (ea 4 ?result) ?layout)) t)
485 bindings)))
486 (if (eq bindings :fail)
487 'instance
488 (layout-name (cdr (assq '?layout bindings)))))))))
490 (defun deduce-fun-subtype (iterator bindings)
491 (declare (ignorable iterator bindings))
492 #-immobile-space 'function
493 #+immobile-space
494 (let* ((bindings
495 (matchp iterator
496 (load-time-value
497 `((mov ?scratch ?header)
498 (or :qword ?scratch
499 (ea ,(ash sb-vm::thread-function-layout-slot sb-vm:word-shift)
500 ,(get-gpr :qword sb-vm::thread-reg)))
501 (mov :qword (ea ,(- sb-vm:fun-pointer-lowtag) ?result) ?scratch))
503 bindings))
504 (header (and (listp bindings) (cdr (assoc '?header bindings)))))
505 (if (and (integerp header) (eq (logand header #xFF) sb-vm:closure-widetag))
506 'closure
507 'function)))
509 (defun deduce-type (pc dstate code &optional (*debug-deduce-type* *debug-deduce-type*)
510 template-name)
511 (dx-let ((iterator (list* 0 (make-array 16 :fill-pointer 0) dstate)))
512 (iterator-begin iterator pc code)
514 ;; Expect an increment of the allocation point hit counter
515 (let* ((inst (get-instruction iterator))
516 (ea (third inst))) ; (INC :qword EA)
517 (when (and (eq (car inst) 'inc) (machine-ea-base ea) (null (machine-ea-index ea)))
518 (incf (car iterator))
519 (let ((profiler-base (machine-ea-base ea))
520 (profiler-index (machine-ea-disp ea)))
521 ;; Optional: the total number of bytes at the allocation point
522 (let* ((inst (get-instruction iterator))
523 (ea (third inst)))
524 (when (and (eq (car inst) 'add)
525 (machine-ea-p ea)
526 (eql (machine-ea-base ea) profiler-base)
527 (null (machine-ea-index ea))
528 (eql (machine-ea-disp ea) (+ profiler-index sb-vm:n-word-bytes)))
529 (incf (car iterator)))))))
531 ;; Expect a store to the pseudo-atomic flag
532 #-sb-safepoint
533 (when (eq (matchp iterator
534 (load-time-value `((mov :qword ,p-a-flag ,(get-gpr :qword thread-reg))) t)
535 nil) :fail)
536 (return-from deduce-type (values nil nil)))
538 (let* ((type)
539 (bindings
540 (matchp iterator `((mov :qword ?free :tlab-freeptr)) nil))
541 (templates
542 (cond (template-name
543 (list (find template-name *allocation-templates* :key 'car)))
544 ((eq bindings :fail)
545 (setq bindings nil)
546 *allocation-templates-large*)
547 (t *allocation-templates*))))
548 (dolist (allocator templates
549 (error "Unrecognized allocator at ~x in ~s:~{~%~S~}"
550 pc code (coerce (second iterator) 'list)))
551 (note "Trying ~a~%" (car allocator))
552 (let ((new-bindings (matchp iterator (cdr allocator) bindings)))
553 (unless (eq new-bindings :fail)
554 (setq bindings new-bindings type (car allocator))
555 (return))))
556 (if (eq bindings :fail)
557 (values nil nil)
558 (let ((nbytes (cdr (assoc '?nbytes bindings)))
559 (header (cdr (assoc '?header bindings)))
560 (lowtag (cdr (assoc '?lowtag bindings))))
561 ;; matchp converts NIL in a machine-ea to 0. The disassembler uses
562 ;; NIL to signify that there was no displacement, which makes sense
563 ;; when register indirect mode is used without a SIB byte.
564 (when (eq nbytes 0)
565 (setq nbytes nil))
566 (cond ((and (member type '(fixed+header var-array var-xadd any))
567 (typep header '(or sb-vm:word sb-vm:signed-word)))
568 (setq type (aref *tag-to-type* (logand header #xFF)))
569 (when (register-p nbytes)
570 (setq nbytes nil))
571 (when (eq type 'instance)
572 (setq type (deduce-layout iterator bindings))))
573 ((eq type 'list) ; listify-rest-arg
574 (unless (integerp nbytes)
575 (setq nbytes nil)))
576 ((eq type 'acons)
577 (setq type 'list nbytes (* 2 sb-vm:cons-size sb-vm:n-word-bytes)))
578 ((member type '(any unknown-header))
579 (setq type (case lowtag
580 (#.sb-vm:list-pointer-lowtag 'list)
581 (#.sb-vm:instance-pointer-lowtag 'instance)
582 (#.sb-vm:fun-pointer-lowtag
583 (deduce-fun-subtype iterator bindings))
584 (t '#:|unknown|)))))
585 (values type nbytes))))))
587 ;;; Return a name for PC-OFFSET in CODE. PC-OFFSET is relative to
588 ;;; CODE-INSTRUCTIONS.
589 (defun pc-offset-to-fun-name (pc-offset code)
590 (if (eq sb-vm::*assembler-routines* code)
591 (block nil
592 (maphash (lambda (k v) ; FIXME: OAOO violation, at least twice over
593 (when (<= (car v) pc-offset (cadr v))
594 (return k)))
595 (%code-debug-info code)))
596 (sb-c::compiled-debug-fun-name
597 (sb-di::compiled-debug-fun-compiler-debug-fun
598 (sb-di::debug-fun-from-pc code pc-offset)))))
600 (defun aprof-collect (stream)
601 (sb-disassem:get-inst-space) ; for effect
602 (let* ((metadata *allocation-profile-metadata*)
603 (n-hit *n-profile-sites*)
604 (metadata-len (/ (length metadata) 2))
605 (n-counters (min metadata-len n-hit))
606 (sap (extern-alien "alloc_profile_buffer" system-area-pointer))
607 (index 3)
608 (dstate (sb-disassem:make-dstate nil))
609 (collection (make-hash-table :test 'equal)))
610 (when stream
611 (format stream "~&~d (of ~d max) profile entries consumed~2%"
612 n-hit metadata-len))
613 (loop
614 (when (>= index n-counters)
615 (return collection))
616 (let ((count (sap-ref-word sap (* index 8))))
617 (multiple-value-bind (code pc-offset total-bytes)
618 (if (null (aref metadata (ash index 1))) ; sized alloc
619 (values (aref metadata (+ (ash index 1) 2))
620 (aref metadata (+ (ash index 1) 3))
621 (sap-ref-word sap (* (1+ index) 8)))
622 (values (aref metadata (+ (ash index 1) 0))
623 (aref metadata (+ (ash index 1) 1))))
624 (incf index (if total-bytes 2 1)) ; <count,bytes> or just a count
625 (unless (eql count 0)
626 (with-pinned-objects (code)
627 (let ((pc (+ (get-lisp-obj-address code)
628 (- sb-vm:other-pointer-lowtag)
629 pc-offset))
630 (name (pc-offset-to-fun-name
631 ;; Relativize to CODE-INSTRUCTIONS, not the base address
632 (- pc-offset (ash (code-header-words code) sb-vm:word-shift))
633 code)))
634 (multiple-value-bind (type size) (deduce-type pc dstate code)
635 (cond (size ; fixed-size allocation
636 (aver (not total-bytes))
637 (push (make-alloc (* count size) count type pc)
638 (gethash name collection)))
639 (t ; variable-size
640 (aver total-bytes)
641 ;; The only allocator that determines TYPE at run-time
642 ;; is ALLOCATE-VECTOR-ON-HEAP. VAR-ALLOC uses a codegen
643 ;; arg, not a TN, for the widetag, but it looks like
644 ;; the type is dynamic because it kind of is: the
645 ;; header words is computed into a register and written
646 ;; in a single store.
647 (push (make-alloc total-bytes count type pc)
648 (gethash name collection)))))))))))))
650 (defun collapse-by-type (data &aux new)
651 (dolist (datum data new)
652 (let ((found (find (alloc-type datum) new :key #'alloc-type
653 :test #'equal)))
654 (cond (found
655 (incf (alloc-bytes found) (alloc-bytes datum))
656 (incf (alloc-count found) (alloc-count datum)))
658 (push datum new))))))
660 ;; DETAIL NIL shows just function name and percent space consumption
661 ;; DETAIL T shows function, bytes, percent,
662 ;; and unless there is only one detail line, the detail lines
664 (defun aprof-show (&key (top-n 20) (detail t) (collapse t) (stream *standard-output*))
665 (unless top-n
666 (setq top-n 1000))
667 (let* ((collection (%hash-table-alist (aprof-collect stream)))
668 (summary
669 (mapcar (lambda (x)
670 (list* (car x)
671 (reduce #'+ (cdr x) :key #'alloc-bytes)
672 (cdr x)))
673 collection))
674 (sorted (sort (copy-list summary) #'> :key #'second))
675 (total-bytes (reduce #'+ sorted :key #'second))
676 (*print-pretty* nil)
677 (i 0)
678 (sum-pct 0)
679 (sum-bytes 0))
680 (when (eq stream nil)
681 (setq stream sb-impl::*null-broadcast-stream*)) ; lazy's person's approach
682 (cond ((not detail)
683 (format stream "~& % Sum % Bytes Allocations Function~%")
684 (format stream "~& ------- ------- ----------- ----------- --------~%"))
686 (format stream "~& % Bytes Count ~:[~; PC ~]Function~%"
687 (not collapse))
688 (format stream "~& ------- ----------- --------- ~:[~;---------- ~]--------~%"
689 (not collapse))))
691 ;; In detailed view, each function takes up either one line or
692 ;; more than one line. In the interest of saving space, newlines are
693 ;; omitted between consecutive functions each of whose detail consists
694 ;; of a single line. But to avoid ambiguity, esure that there is always
695 ;; a blank line before and after each function having multiline detail.
696 (let ((emitted-newline t))
697 (dolist (x sorted)
698 (destructuring-bind (name bytes . data) x
699 (when detail
700 (when collapse
701 (setq data (collapse-by-type data)))
702 (setq data (sort data #'> :key #'alloc-bytes)))
703 (assert (eq bytes (reduce #'+ data :key #'alloc-bytes)))
704 (when (and detail (cdr data) (not emitted-newline))
705 (terpri stream))
706 (incf sum-pct (float (/ bytes total-bytes)))
707 ;; Show summary for the function
708 (cond ((not detail)
709 (format stream " ~5,1,2f ~5,1,2f ~12d~15d ~s~%"
710 (/ bytes total-bytes)
711 sum-pct
712 bytes
713 (reduce #'+ data :key #'alloc-count)
714 name))
716 (format stream " ~5,1,2f ~12d ~:[~10@t~;~:*~10d~]~@[~14@a~] ~s~@[ - ~s~]~%"
717 (/ bytes total-bytes)
718 bytes
719 (if (cdr data) nil (alloc-count (car data)))
720 (cond (collapse nil)
721 ((cdr data) "")
722 (t (write-to-string (alloc-pc (car data)) :base 16)))
723 name
724 (if (cdr data) nil (alloc-type (car data)))
726 (when (and detail (cdr data))
727 (dolist (point data)
728 (format stream " ~5,1,2f ~12d ~10d~@[~14x~]~@[ ~s~]~%"
729 (/ (alloc-bytes point) bytes) ; fraction within function
730 (alloc-bytes point)
731 (alloc-count point)
732 (if collapse nil (alloc-pc point))
733 (alloc-type point))))
734 (incf sum-bytes bytes)
735 (when (and detail
736 (setq emitted-newline (not (null (cdr data)))))
737 (terpri stream)))
738 (incf i)
739 (if (and (neq top-n :all) (>= i top-n)) (return))))
740 ; (assert (= sum-bytes total-bytes))
741 (cond ((not detail)
742 (format stream "~19@t===========~%~19@t~11d~%" sum-bytes))
744 (format stream " ======= ===========~%~6,1,2f ~12d~%"
745 sum-pct sum-bytes)))
746 sum-bytes))
748 ;;; Call FUN and return the exact number of bytes it (an all descendant
749 ;;; calls) allocated, provided that they were instrumented for precise
750 ;;; cons profiling.
751 ;;; STREAM is where to report, defaulting to *standard-output*.
752 ;;; The convention is that of map-segment-instructions, meaning NIL is a sink.
753 (defun aprof-run (fun &key (report t) (stream *standard-output*) arguments)
754 (aprof-reset)
755 (patch-all-code)
756 (dx-let ((arglist (cons arguments nil))) ; so no consing in here
757 (when (listp arguments)
758 (setq arglist (car arglist))) ; was already a list
759 (let (nbytes)
760 (unwind-protect
761 (progn (aprof-start) (apply fun arglist))
762 (aprof-stop))
763 (when report
764 (setq nbytes (aprof-show :stream stream))
765 (when stream (terpri stream)))
766 nbytes)))
768 ;;;;
770 ;;; Example:
771 ;;; The default output is a report showing the top 20 allocators
772 ;;; (by function name) with a line of detail for each distinct
773 ;;; type of object allocated within the function.
775 ;;; * (aprof-run (lambda () (compile-file "~/aprof")))
776 ;;; % Bytes Count Function
777 ;;; ------- ----------- --------- --------
778 ;;; 6.6 1980160 INIT-SB-VECTORS
779 ;;; 79.1 1565824 48880 SIMPLE-BIT-VECTOR
780 ;;; 20.9 414336 1872 SIMPLE-VECTOR
782 ;;; 4.6 1368960 MAKE-TN
783 ;;; 83.3 1140800 7130 TN
784 ;;; 16.7 228160 7130 SIMPLE-BIT-VECTOR
786 ;;; The report can be made more detailed by not combining lines
787 ;;; for the same object type within a function:
789 ;;; * (aprof-show :detail t :collapse nil)
791 ;;; % Bytes Count PC Function
792 ;;; ------- ----------- --------- ---------- --------
793 ;;; 6.6 1980160 INIT-SB-VECTORS
794 ;;; 76.0 1504256 47008 21D07E88 SIMPLE-BIT-VECTOR
795 ;;; 20.9 414336 1872 21D07D28 SIMPLE-VECTOR
796 ;;; 3.1 61568 1872 21D07F30 SIMPLE-BIT-VECTOR
798 ;;; 4.6 1368960 MAKE-TN
799 ;;; 83.3 1140800 7130 21C98638 TN
800 ;;; 16.7 228160 7130 21C985D8 SIMPLE-BIT-VECTOR
802 ;;; 4.0 1204160 MAKE-HASH-TABLE
803 ;;; 39.3 473472 1644 21B2A008 SIMPLE-VECTOR
804 ;;; 37.1 447168 1644 21B29F50 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
805 ;;; 21.8 263040 1644 21B29FA8 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
806 ;;; 1.7 20480 128 21B2A098 (SIMPLE-ARRAY (UNSIGNED-BYTE 64) (*))
808 ;;; Or less detailed by combining all lines within a function:
810 ;;; * (sb-aprof::aprof-show :detail nil :top-n 1000) ; or anything
811 ;;; % Sum % Bytes Allocations Function
812 ;;; ------- ------- ----------- ----------- --------
813 ;;; 6.6 6.6 1980160 50752 INIT-SB-VECTORS
814 ;;; 4.6 11.1 1368960 14260 MAKE-TN
815 ;;; 4.0 15.1 1204160 5060 MAKE-HASH-TABLE
816 ;;; 3.8 19.0 1154384 28727 CONSTRAIN-REF-TYPE
817 ;;; 3.8 22.8 1152160 13084 COPY-CONSET
818 ;;; 3.6 26.5 1094240 13678 MAKE-TN-REF
819 ;;; ... many more lines ...
820 ;;; 00.0 100.0 16 1 %ENTER-NEW-NICKNAMES
821 ;;; 00.0 100.0 16 1 UNIX-LSTAT
822 ;;; ===========
823 ;;; 30054816