/*
 * qsp.c - QEMU Synchronization Profiler
 *
 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
 *
 * License: GNU GPL, version 2 or later.
 *   See the COPYING file in the top-level directory.
 *
 * QSP profiles the time spent in synchronization primitives, which can
 * help diagnose performance problems, e.g. scalability issues when
 * contention is high.
 *
 * The primitives currently supported are mutexes, recursive mutexes and
 * condition variables. Note that not all related functions are intercepted;
 * instead we profile only those functions that can have a performance impact,
 * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
 * contention (e.g. mutex_lock, mutex_trylock).
 *
 * QSP's design focuses on speed and scalability. This is achieved
 * by having threads do their profiling entirely on thread-local data.
 * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
 * table. To aggregate data in order to generate a report, we iterate over
 * all entries in the hash table. Depending on the number of threads and
 * synchronization objects this might be expensive, but note that it is
 * very rarely called -- reports are generated only when requested by users.
 *
 * Reports are generated as a table where each row represents a call site. A
 * call site is the triplet formed by the __FILE__ and __LINE__ of the caller
 * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
 * being operated on. Optionally, call sites that operate on different objects
 * of the same type can be coalesced, which can be particularly useful when
 * profiling dynamically-allocated objects.
 *
 * Alternative designs considered:
 *
 * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
 *   for us because QEMU has __malloc_hook set (by one of the libraries it
 *   uses); leaving this hook unset is required to avoid deadlock in mutrace.
 *
 * - Use a glib HT for each thread, protecting each HT with its own lock.
 *   This isn't simpler than the current design, and is 10% slower in the
 *   atomic_add-bench microbenchmark (-m option).
 *
 * - For reports, just use a binary tree as we aggregate data, instead of
 *   having an intermediate hash table. This would simplify the code only
 *   slightly, but would perform badly if there were many threads and objects
 *   to track.
 *
 * - Wrap operations on qsp entries with RCU read-side critical sections, so
 *   that qsp_reset() can delete entries. Unfortunately, the overhead of
 *   calling rcu_read_lock/unlock slows down atomic_add-bench -m by 24%.
 *   Having a snapshot that is updated on qsp_reset() avoids this overhead.
 *
 * Related Work:
 * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
 * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
 *   Critical-Section Execution to Improve the Performance of Multithreaded
 *   Applications", USENIX ATC'12.
 */
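
/*
 * Usage sketch (not from the original file; a minimal example that uses only
 * the entry points defined below -- qsp_enable(), qsp_report(), qsp_reset()
 * and qsp_disable()):
 *
 *     qsp_enable();
 *     ... run the workload of interest ...
 *     // print the top 10 call sites by total wait time, coalescing call
 *     // sites that differ only in the object they operate on:
 *     qsp_report(10, QSP_SORT_BY_TOTAL_WAIT_TIME, true);
 *     qsp_reset();    // subsequent reports only show post-reset activity
 *     qsp_disable();
 */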
#include "qemu/osdep.h"
#include "qemu/qemu-print.h"
#include "qemu/thread.h"
#include "qemu/timer.h"
#include "qemu/qht.h"
#include "qemu/rcu.h"
#include "qemu/xxhash.h"
enum QSPType {
    QSP_MUTEX,
    QSP_BQL_MUTEX,
    QSP_REC_MUTEX,
    QSP_CONDVAR,
};

struct QSPCallSite {
    const void *obj;
    const char *file; /* i.e. __FILE__; shortened later */
    int line;
    enum QSPType type;
};
typedef struct QSPCallSite QSPCallSite;

struct QSPEntry {
    void *thread_ptr;
    const QSPCallSite *callsite;
    uint64_t n_acqs;
    uint64_t ns;
    unsigned int n_objs; /* count of coalesced objs; only used for reporting */
};
typedef struct QSPEntry QSPEntry;

struct QSPSnapshot {
    struct rcu_head rcu;
    struct qht ht;
};
typedef struct QSPSnapshot QSPSnapshot;

/* initial sizing for hash tables */
#define QSP_INITIAL_SIZE 64
/* If this file is moved, QSP_REL_PATH should be updated accordingly */
#define QSP_REL_PATH "util/qsp.c"

/* this file's full path. Used to present all call sites with relative paths */
static size_t qsp_qemu_path_len;

/* the address of qsp_thread gives us a unique 'thread ID' */
static __thread int qsp_thread;

/*
 * Call sites are the same for all threads, so we track them in a separate hash
 * table to save memory.
 */
static struct qht qsp_callsite_ht;

static struct qht qsp_ht;
static QSPSnapshot *qsp_snapshot;
static bool qsp_initialized, qsp_initializing;
static const char * const qsp_typenames[] = {
    [QSP_MUTEX]     = "mutex",
    [QSP_BQL_MUTEX] = "BQL mutex",
    [QSP_REC_MUTEX] = "rec_mutex",
    [QSP_CONDVAR]   = "condvar",
};
QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
    qemu_rec_mutex_trylock_impl;
QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;
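
/*
 * How interception works (a sketch; the exact wrapper macros live in
 * include/qemu/thread.h and may differ in detail): callers never invoke the
 * *_impl functions directly. Instead, wrappers dispatch through the function
 * pointers above, passing the caller's __FILE__ and __LINE__, roughly:
 *
 *     #define qemu_mutex_lock(m) ({                                        \
 *             QemuMutexLockFunc f = atomic_read(&qemu_mutex_lock_func);    \
 *             f(m, __FILE__, __LINE__);                                    \
 *         })
 *
 * With profiling disabled, the pointers hold the *_impl functions and the
 * only cost is an indirect call; qsp_enable() swaps in the qsp_* wrappers
 * defined below.
 */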
/*
 * It pays off to _not_ hash callsite->file; hashing a string is slow, and
 * without it we still get a pretty unique hash.
 */
static inline
uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab)
{
    uint64_t cd = (uint64_t)(uintptr_t)callsite->obj;
    uint32_t e = callsite->line;
    uint32_t f = callsite->type;

    return qemu_xxhash6(ab, cd, e, f);
}

static inline
uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
{
    return do_qsp_callsite_hash(callsite, 0);
}

static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
{
    return do_qsp_callsite_hash(entry->callsite, a);
}

static uint32_t qsp_entry_hash(const QSPEntry *entry)
{
    return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
}

static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
{
    return do_qsp_entry_hash(entry, 0);
}

/* without the objects we need to hash the file name to get a decent hash */
static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry)
{
    const QSPCallSite *callsite = entry->callsite;
    uint64_t ab = g_str_hash(callsite->file);
    uint64_t cd = callsite->line;
    uint32_t e = callsite->type;

    return qemu_xxhash5(ab, cd, e);
}
static bool qsp_callsite_cmp(const void *ap, const void *bp)
{
    const QSPCallSite *a = ap;
    const QSPCallSite *b = bp;

    return a == b ||
        (a->obj == b->obj &&
         a->line == b->line &&
         a->type == b->type &&
         (a->file == b->file || !strcmp(a->file, b->file)));
}

static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp)
{
    const QSPCallSite *a = ap;
    const QSPCallSite *b = bp;

    return a == b ||
        (a->line == b->line &&
         a->type == b->type &&
         (a->file == b->file || !strcmp(a->file, b->file)));
}

static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return qsp_callsite_cmp(a->callsite, b->callsite);
}

static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return qsp_callsite_no_obj_cmp(a->callsite, b->callsite);
}

static bool qsp_entry_cmp(const void *ap, const void *bp)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;

    return a->thread_ptr == b->thread_ptr &&
        qsp_callsite_cmp(a->callsite, b->callsite);
}
/*
 * Normally we'd call this from a constructor function, but we want it to work
 * via libutil as well.
 */
static void qsp_do_init(void)
{
    /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
    g_assert(strstr(__FILE__, QSP_REL_PATH));
    qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);

    qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
}
static __attribute__((noinline)) void qsp_init__slowpath(void)
{
    if (atomic_cmpxchg(&qsp_initializing, false, true) == false) {
        qsp_do_init();
        atomic_set(&qsp_initialized, true);
    } else {
        while (!atomic_read(&qsp_initialized)) {
            cpu_relax();
        }
    }
}

/* qsp_init() must be called from _all_ exported functions */
static inline void qsp_init(void)
{
    if (likely(atomic_read(&qsp_initialized))) {
        return;
    }
    qsp_init__slowpath();
}
static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
{
    QSPCallSite *callsite;
    uint32_t hash;

    hash = qsp_callsite_hash(orig);
    callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
    if (callsite == NULL) {
        void *existing = NULL;

        callsite = g_new(QSPCallSite, 1);
        memcpy(callsite, orig, sizeof(*callsite));
        qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
        if (unlikely(existing)) {
            /* we lost the insertion race; use the winner's copy */
            g_free(callsite);
            callsite = existing;
        }
    }
    return callsite;
}
static QSPEntry *
qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
{
    QSPEntry *e;
    void *existing = NULL;

    e = g_new0(QSPEntry, 1);
    e->thread_ptr = entry->thread_ptr;
    e->callsite = qsp_callsite_find(entry->callsite);

    qht_insert(ht, e, hash, &existing);
    if (unlikely(existing)) {
        /* we lost the insertion race; use the winner's copy */
        g_free(e);
        e = existing;
    }
    return e;
}
static QSPEntry *
qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
{
    QSPEntry *e;

    e = qht_lookup(ht, entry, hash);
    if (e == NULL) {
        e = qsp_entry_create(ht, entry, hash);
    }
    return e;
}
/*
 * Note: Entries are never removed, so callers do not have to be in an RCU
 * read-side critical section.
 */
static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
                               enum QSPType type)
{
    QSPCallSite callsite = {
        .obj = obj,
        .file = file,
        .line = line,
        .type = type,
    };
    QSPEntry orig;
    uint32_t hash;

    qsp_init();

    orig.thread_ptr = &qsp_thread;
    orig.callsite = &callsite;

    hash = qsp_entry_hash(&orig);
    return qsp_entry_find(&qsp_ht, &orig, hash);
}
/*
 * @e is in the global hash table; it is only written to by the current thread,
 * so we write to it atomically (as in "write once") to prevent torn reads.
 */
static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
{
    atomic_set_u64(&e->ns, e->ns + delta);
    if (acq) {
        atomic_set_u64(&e->n_acqs, e->n_acqs + 1);
    }
}

static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
{
    do_qsp_entry_record(e, delta, true);
}
#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_)                       \
    static void func_(type_ *obj, const char *file, int line)           \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
                                                                        \
        t0 = get_clock();                                               \
        impl_(obj, file, line);                                         \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        qsp_entry_record(e, t1 - t0);                                   \
    }

#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_)                       \
    static int func_(type_ *obj, const char *file, int line)            \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
        int err;                                                        \
                                                                        \
        t0 = get_clock();                                               \
        err = impl_(obj, file, line);                                   \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        do_qsp_entry_record(e, t1 - t0, !err);                          \
        return err;                                                     \
    }
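
/*
 * To make the generated code concrete, QSP_GEN_VOID(QemuMutex, QSP_MUTEX,
 * qsp_mutex_lock, qemu_mutex_lock_impl) expands to (roughly):
 *
 *     static void qsp_mutex_lock(QemuMutex *obj, const char *file, int line)
 *     {
 *         QSPEntry *e;
 *         int64_t t0, t1;
 *
 *         t0 = get_clock();
 *         qemu_mutex_lock_impl(obj, file, line);
 *         t1 = get_clock();
 *
 *         e = qsp_entry_get(obj, file, line, QSP_MUTEX);
 *         qsp_entry_record(e, t1 - t0);
 *     }
 *
 * i.e. only the acquisition itself is timed; time spent inside the critical
 * section is not included in the measurement.
 */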
QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)

QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
             qemu_rec_mutex_lock_impl)
QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
             qemu_rec_mutex_trylock_impl)

#undef QSP_GEN_RET1
#undef QSP_GEN_VOID
static void
qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
{
    QSPEntry *e;
    int64_t t0, t1;

    t0 = get_clock();
    qemu_cond_wait_impl(cond, mutex, file, line);
    t1 = get_clock();

    e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
    qsp_entry_record(e, t1 - t0);
}
bool qsp_is_enabled(void)
{
    return atomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
}
void qsp_enable(void)
{
    atomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
    atomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
    atomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock);
    atomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
    atomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
    atomic_set(&qemu_cond_wait_func, qsp_cond_wait);
}
void qsp_disable(void)
{
    atomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
    atomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
    atomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
}
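
/*
 * Note: in a full QEMU build these toggles are normally driven from the
 * monitor rather than called directly -- e.g. the HMP commands
 * "sync-profile [on|off]" and "info sync-profile" (command names may vary
 * across QEMU versions).
 */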
static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
{
    const QSPEntry *a = ap;
    const QSPEntry *b = bp;
    enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
    const QSPCallSite *ca;
    const QSPCallSite *cb;

    switch (sort_by) {
    case QSP_SORT_BY_TOTAL_WAIT_TIME:
        if (a->ns > b->ns) {
            return -1;
        } else if (a->ns < b->ns) {
            return 1;
        }
        break;
    case QSP_SORT_BY_AVG_WAIT_TIME:
    {
        double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0;
        double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0;

        if (avg_a > avg_b) {
            return -1;
        } else if (avg_a < avg_b) {
            return 1;
        }
        break;
    }
    default:
        g_assert_not_reached();
    }

    ca = a->callsite;
    cb = b->callsite;
    /* Break the tie with the object's address */
    if (ca->obj < cb->obj) {
        return -1;
    } else if (ca->obj > cb->obj) {
        return 1;
    } else {
        int cmp;

        /* same obj. Break the tie with the callsite's file */
        cmp = strcmp(ca->file, cb->file);
        if (cmp) {
            return cmp;
        }
        /* same callsite file. Break the tie with the callsite's line */
        g_assert(ca->line != cb->line || ca->type != cb->type);
        if (ca->line < cb->line) {
            return -1;
        } else if (ca->line > cb->line) {
            return 1;
        } else {
            /* break the tie with the callsite's type */
            return cb->type - ca->type;
        }
    }
}
static void qsp_sort(void *p, uint32_t h, void *userp)
{
    QSPEntry *e = p;
    GTree *tree = userp;

    g_tree_insert(tree, e, NULL);
}
static void qsp_aggregate(void *p, uint32_t h, void *up)
{
    struct qht *ht = up;
    const QSPEntry *e = p;
    QSPEntry *agg;
    uint32_t hash;

    hash = qsp_entry_no_thread_hash(e);
    agg = qsp_entry_find(ht, e, hash);
    /*
     * The entry is in the global hash table; read from it atomically (as in
     * "read once").
     */
    agg->ns += atomic_read_u64(&e->ns);
    agg->n_acqs += atomic_read_u64(&e->n_acqs);
}
static void qsp_iter_diff(void *p, uint32_t hash, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *new;

    new = qht_lookup(ht, old, hash);
    /* entries are never deleted, so we must have this one */
    g_assert(new != NULL);
    /* our reading of the stats happened after the snapshot was taken */
    g_assert(new->n_acqs >= old->n_acqs);
    g_assert(new->ns >= old->ns);

    new->n_acqs -= old->n_acqs;
    new->ns -= old->ns;

    /* No point in reporting an empty entry */
    if (new->n_acqs == 0 && new->ns == 0) {
        bool removed = qht_remove(ht, new, hash);

        g_assert(removed);
        g_free(new);
    }
}
static void qsp_diff(struct qht *orig, struct qht *new)
{
    qht_iter(orig, qsp_iter_diff, new);
}
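
/*
 * Worked example of the diff (hypothetical numbers): suppose a call site had
 * n_acqs == 10 and ns == 5000 when the snapshot was taken, and the aggregated
 * table now reads n_acqs == 15 and ns == 8000. The report then shows
 * n_acqs == 5 and ns == 3000, i.e. only post-reset activity; entries whose
 * counters have not moved since the snapshot are removed outright.
 */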
static void qsp_iter_callsite_coalesce(void *p, uint32_t h, void *htp)
{
    struct qht *ht = htp;
    QSPEntry *old = p;
    QSPEntry *e;
    uint32_t hash;

    hash = qsp_entry_no_thread_obj_hash(old);
    e = qht_lookup(ht, old, hash);
    if (e == NULL) {
        e = qsp_entry_create(ht, old, hash);
        e->n_objs = 1;
    } else if (e->callsite->obj != old->callsite->obj) {
        e->n_objs++;
    }
    e->ns += old->ns;
    e->n_acqs += old->n_acqs;
}
static void qsp_ht_delete(void *p, uint32_t h, void *htp)
{
    g_free(p);
}
static void qsp_mktree(GTree *tree, bool callsite_coalesce)
{
    QSPSnapshot *snap;
    struct qht ht, coalesce_ht;
    struct qht *htp;

    /*
     * First, see if there's a prior snapshot, so that we read the global hash
     * table _after_ the snapshot has been created, which guarantees that
     * the entries we'll read will be a superset of the snapshot's entries.
     *
     * We must remain in an RCU read-side critical section until we're done
     * with the snapshot.
     */
    rcu_read_lock();
    snap = atomic_rcu_read(&qsp_snapshot);

    /* Aggregate all results from the global hash table into a local one */
    qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_iter(&qsp_ht, qsp_aggregate, &ht);

    /* compute the difference wrt the snapshot, if any */
    if (snap) {
        qsp_diff(&snap->ht, &ht);
    }
    /* done with the snapshot; RCU can reclaim it */
    rcu_read_unlock();

    htp = &ht;
    if (callsite_coalesce) {
        qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE,
                 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
        qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht);

        /* free the previous hash table, and point htp to coalesce_ht */
        qht_iter(&ht, qsp_ht_delete, NULL);
        qht_destroy(&ht);
        htp = &coalesce_ht;
    }

    /* sort the hash table elements by using a tree */
    qht_iter(htp, qsp_sort, tree);

    /* free the hash table, but keep the elements (those are in the tree now) */
    qht_destroy(htp);
}
/* free string with g_free */
static char *qsp_at(const QSPCallSite *callsite)
{
    GString *s = g_string_new(NULL);
    const char *shortened;

    /* remove the absolute path to qemu */
    if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
        shortened = callsite->file;
    } else {
        shortened = callsite->file + qsp_qemu_path_len;
    }
    g_string_append_printf(s, "%s:%u", shortened, callsite->line);
    return g_string_free(s, FALSE);
}
struct QSPReportEntry {
    const void *obj;
    char *callsite_at;
    const char *typename;
    double time_s;
    double ns_avg;
    uint64_t n_acqs;
    unsigned int n_objs;
};
typedef struct QSPReportEntry QSPReportEntry;

struct QSPReport {
    QSPReportEntry *entries;
    size_t n_entries;
    size_t max_n_entries;
};
typedef struct QSPReport QSPReport;
static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
{
    const QSPEntry *e = key;
    QSPReport *report = udata;
    QSPReportEntry *entry;

    if (report->n_entries == report->max_n_entries) {
        return TRUE;
    }
    entry = &report->entries[report->n_entries];
    report->n_entries++;

    entry->obj = e->callsite->obj;
    entry->n_objs = e->n_objs;
    entry->callsite_at = qsp_at(e->callsite);
    entry->typename = qsp_typenames[e->callsite->type];
    entry->time_s = e->ns * 1e-9;
    entry->n_acqs = e->n_acqs;
    entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0;
    return FALSE;
}
static void pr_report(const QSPReport *rep)
{
    char *dashes;
    size_t max_len = 0;
    int callsite_len = 0;
    int callsite_rspace;
    int n_dashes;
    size_t i;

    /* find out the maximum length of all 'callsite' fields */
    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        size_t len = strlen(e->callsite_at);

        if (len > max_len) {
            max_len = len;
        }
    }

    callsite_len = MAX(max_len, strlen("Call site"));
    /* white space to leave to the right of "Call site" */
    callsite_rspace = callsite_len - strlen("Call site");

    qemu_printf("Type               Object  Call site%*s  Wait Time (s)  "
                "       Count  Average (us)\n", callsite_rspace, "");

    /* build a horizontal rule with dashes */
    n_dashes = 79 + callsite_rspace;
    dashes = g_malloc(n_dashes + 1);
    memset(dashes, '-', n_dashes);
    dashes[n_dashes] = '\0';
    qemu_printf("%s\n", dashes);

    for (i = 0; i < rep->n_entries; i++) {
        const QSPReportEntry *e = &rep->entries[i];
        GString *s = g_string_new(NULL);

        g_string_append_printf(s, "%-9s  ", e->typename);
        if (e->n_objs > 1) {
            g_string_append_printf(s, "[%12u]", e->n_objs);
        } else {
            g_string_append_printf(s, "%14p", e->obj);
        }
        g_string_append_printf(s, "  %s%*s  %13.5f  %12" PRIu64 "  %12.2f\n",
                               e->callsite_at,
                               callsite_len - (int)strlen(e->callsite_at), "",
                               e->time_s, e->n_acqs, e->ns_avg * 1e-3);
        qemu_printf("%s", s->str);
        g_string_free(s, TRUE);
    }

    qemu_printf("%s\n", dashes);
    g_free(dashes);
}
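
/*
 * For illustration, the output looks like the following (the rows here are
 * made up, not real measurements):
 *
 * Type               Object  Call site            Wait Time (s)         Count  Average (us)
 * -----------------------------------------------------------------------------------------
 * BQL mutex  0x560b1b8a0b80  cpus.c:1848                0.61185        761784          0.80
 * mutex      0x560b1b8d2300  chardev/char.c:919         0.00336         22186          0.15
 * -----------------------------------------------------------------------------------------
 */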
static void report_destroy(QSPReport *rep)
{
    size_t i;

    for (i = 0; i < rep->n_entries; i++) {
        QSPReportEntry *e = &rep->entries[i];

        g_free(e->callsite_at);
    }
    g_free(rep->entries);
}
void qsp_report(size_t max, enum QSPSortBy sort_by,
                bool callsite_coalesce)
{
    GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
    QSPReport rep;

    qsp_init();

    rep.entries = g_new0(QSPReportEntry, max);
    rep.n_entries = 0;
    rep.max_n_entries = max;

    qsp_mktree(tree, callsite_coalesce);
    g_tree_foreach(tree, qsp_tree_report, &rep);
    g_tree_destroy(tree);

    pr_report(&rep);
    report_destroy(&rep);
}
static void qsp_snapshot_destroy(QSPSnapshot *snap)
{
    qht_iter(&snap->ht, qsp_ht_delete, NULL);
    qht_destroy(&snap->ht);
    g_free(snap);
}
void qsp_reset(void)
{
    QSPSnapshot *new = g_new(QSPSnapshot, 1);
    QSPSnapshot *old;

    qsp_init();

    qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);

    /* take a snapshot of the current state */
    qht_iter(&qsp_ht, qsp_aggregate, &new->ht);

    /* replace the previous snapshot, if any */
    old = atomic_xchg(&qsp_snapshot, new);
    if (old) {
        call_rcu(old, qsp_snapshot_destroy, rcu);
    }
}