/* Measure various lock acquisition times for empty critical sections.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#define TEST_NAME "pthread-locks"

#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <sys/time.h>

#include "bench-timing.h"
#include "json-lib.h"

/* The point of this benchmark is to measure the overhead of an empty
   critical section or a small critical section.  This is never going
   to be indicative of real application performance.  Instead we are
   trying to benchmark the effects of the compiler and the runtime
   coupled with a particular set of hardware atomic operations.
   The numbers from this benchmark should be taken with a massive
   grain of salt and viewed through the eyes of expert reviewers.  */

static pthread_mutex_t m;
static pthread_rwlock_t rw;
static pthread_cond_t cv;
static pthread_cond_t consumer_c, producer_c;
static int cv_done;
static pthread_spinlock_t sp;
static sem_t sem;

typedef timing_t (*test_t)(long, int);

#define START_ITERS 1000

#define FILLER_GOES_HERE \
  if (filler)		 \
    do_filler ();

/* Everyone loves a good fibonacci series.  This isn't quite one of
   them because we need larger values in fewer steps, in a way that
   won't be optimized away.  We're looking to approximately double the
   total time each test iteration takes, so as to not swamp the useful
   timings.  */

#pragma GCC push_options
#pragma GCC optimize(1)

static int __attribute__((noinline))
fibonacci (int i)
{
  if (i > 2)
    return fibonacci (i-1) + fibonacci (i-2);
  return 10 + i;
}

static void
do_filler (void)
{
  static char buf1[512], buf2[512];
  int f = fibonacci (5);
  memcpy (buf1, buf2, f);
}

#pragma GCC pop_options
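
/* Time one pthread_mutex_lock/unlock round trip per iteration.  The
   mutex is private to this thread, so only the uncontended fast path
   is exercised.  */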
static timing_t
test_mutex (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_mutex_init (&m, NULL);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_mutex_lock (&m);
      FILLER_GOES_HERE;
      pthread_mutex_unlock (&m);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
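
/* Time pthread_mutex_trylock on a mutex this thread already holds,
   so every attempt fails immediately without blocking.  */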
static timing_t
test_mutex_trylock (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_mutex_init (&m, NULL);
  pthread_mutex_lock (&m);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_mutex_trylock (&m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_mutex_unlock (&m);

  return cur;
}
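
/* Time one rdlock/unlock round trip on an uncontended read-write
   lock.  */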
static timing_t
test_rwlock_read (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_rdlock (&rw);
      FILLER_GOES_HERE;
      pthread_rwlock_unlock (&rw);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
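
/* Time pthread_rwlock_tryrdlock while the write lock is held, so
   every attempt fails immediately.  */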
static timing_t
test_rwlock_tryread (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);
  pthread_rwlock_wrlock (&rw);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_tryrdlock (&rw);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_rwlock_unlock (&rw);

  return cur;
}
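
/* Time one wrlock/unlock round trip on an uncontended read-write
   lock.  */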
static timing_t
test_rwlock_write (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_wrlock (&rw);
      FILLER_GOES_HERE;
      pthread_rwlock_unlock (&rw);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
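
/* Time pthread_rwlock_trywrlock while a read lock is held, so every
   attempt fails immediately.  */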
static timing_t
test_rwlock_trywrite (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);
  pthread_rwlock_rdlock (&rw);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_trywrlock (&rw);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_rwlock_unlock (&rw);

  return cur;
}
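
/* Time one spin_lock/spin_unlock round trip; the lock is always free
   when we take it, so we never actually spin.  */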
static timing_t
test_spin_lock (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_spin_lock (&sp);
      FILLER_GOES_HERE;
      pthread_spin_unlock (&sp);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
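
/* Time pthread_spin_trylock on a spin lock we already hold, so every
   attempt fails immediately.  */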
static timing_t
test_spin_trylock (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
  pthread_spin_lock (&sp);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_spin_trylock (&sp);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_spin_unlock (&sp);

  return cur;
}
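
/* Time a sem_post/sem_wait pair.  The semaphore starts at 1, so
   sem_wait always finds a nonzero count and never blocks.  */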
static timing_t
test_sem_wait (long iters, int filler)
{
  timing_t start, stop, cur;

  sem_init (&sem, 0, 1);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      sem_post (&sem);
      FILLER_GOES_HERE;
      sem_wait (&sem);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
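
/* Time sem_trywait on a semaphore whose count is zero, so every
   attempt fails immediately.  */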
static timing_t
test_sem_trywait (long iters, int filler)
{
  timing_t start, stop, cur;

  sem_init (&sem, 0, 0);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      sem_trywait (&sem);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
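
/* Helper thread for test_condvar: keep signalling the condition
   variable until the main thread tells us it is done.  */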
static void *
test_condvar_helper (void *v)
{
  /* This is wasteful, but the alternative is to add the overhead of a
     mutex lock/unlock to the overall iteration (both threads) and we
     don't want that.  Ideally, this thread would run on an
     independent processing core anyway.  The ONLY goal here is to
     minimize the time the other thread spends waiting for us.  */
  while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
    pthread_cond_signal (&cv);

  return NULL;
}
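
/* Time one pthread_cond_wait round trip, with the helper thread
   signalling the condition variable as fast as it can.  */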
static timing_t
test_condvar (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);
  pthread_mutex_lock (&m);

  __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
  pthread_create (&helper_id, NULL, test_condvar_helper, &iters);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_cond_wait (&cv, &m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_mutex_unlock (&m);
  __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);

  pthread_join (helper_id, NULL);

  return cur;
}

/* How many items are "queued" in our pretend queue.  */
static int queued = 0;

typedef struct Producer_Params {
  long iters;
  int filler;
} Producer_Params;

/* We only benchmark the consumer thread, but both threads are doing
   essentially the same thing, and never run in parallel due to the
   locks.  Thus, even if they run on separate processing cores, we
   count the time for both threads.  */
static void *
test_producer_thread (void *v)
{
  Producer_Params *p = (Producer_Params *) v;
  long iters = p->iters;
  int filler = p->filler;
  long j;

  for (j = iters; j >= 0; --j)
    {
      /* Acquire lock on the queue.  */
      pthread_mutex_lock (&m);
      /* If something's already there, wait.  */
      while (queued > 0)
	pthread_cond_wait (&consumer_c, &m);

      /* Put something on the queue.  */
      FILLER_GOES_HERE;
      ++ queued;
      pthread_cond_signal (&producer_c);

      /* Give the other thread a chance to run.  */
      pthread_mutex_unlock (&m);
    }

  return NULL;
}
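
/* Consumer side of a one-slot producer/consumer queue; each
   iteration times a full hand-off from the producer thread.  */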
static timing_t
test_consumer_producer (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;
  Producer_Params p;

  p.iters = iters;
  p.filler = filler;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);

  pthread_create (&helper_id, NULL, test_producer_thread, &p);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      /* Acquire lock on the queue.  */
      pthread_mutex_lock (&m);
      /* Wait for something to be on the queue.  */
      while (queued == 0)
	pthread_cond_wait (&producer_c, &m);

      /* Take it off.  */
      FILLER_GOES_HERE;
      -- queued;
      pthread_cond_signal (&consumer_c);

      /* Give the other thread a chance to run.  */
      pthread_mutex_unlock (&m);
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_join (helper_id, NULL);

  return cur;
}

/* Number of runs we use for computing mean and standard deviation.
   We actually do two additional runs and discard the outliers.  */
#define RUN_COUNT 10

static int
do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
{
  timing_t cur;
  struct timeval ts, te;
  double tsd, ted, td;
  long iters, iters_limit;
  timing_t curs[RUN_COUNT + 2];
  int i, j;
  double mean, stdev;

  iters = START_ITERS;
  iters_limit = LONG_MAX / 100;

  while (1) {
    gettimeofday (&ts, NULL);
    cur = func(iters, filler);
    gettimeofday (&te, NULL);

    /* We want a test to take at least 0.01 seconds, and try
       increasingly larger iteration counts until it does.  This
       allows for approximately constant-time tests regardless of
       hardware speed, without the overhead of checking the time
       inside the test loop itself.  We stop at a million iterations
       as that should be precise enough.  Once we determine a suitable
       iteration count, we run the test multiple times to calculate
       mean and standard deviation.  */

    /* Note that this also primes the CPU cache and triggers faster
       clock speeds, we hope.  */
    tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
    ted = te.tv_sec + te.tv_usec / 1000000.0;
    td = ted - tsd;

    if (td >= 0.01
	|| iters >= iters_limit
	|| iters >= 1000000)
      break;

    iters *= 10;
  }

  curs[0] = cur;
  for (i = 1; i < RUN_COUNT + 2; i++)
    curs[i] = func(iters, filler);

  /* We sort the results so we can discard the fastest and slowest
     times as outliers.  In theory we should keep the fastest time,
     but IMHO this is more fair.  A simple bubble sort suffices.  */

  for (i = 0; i < RUN_COUNT + 1; i++)
    for (j = i + 1; j < RUN_COUNT + 2; j++)
      if (curs[i] > curs[j])
	{
	  timing_t temp = curs[i];
	  curs[i] = curs[j];
	  curs[j] = temp;
	}

  /* Now calculate mean and standard deviation, skipping the outliers.  */
  mean = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i++)
    mean += (double) curs[i] / (double) iters;
  mean /= RUN_COUNT;

  stdev = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i++)
    {
      double s = (double) curs[i] / (double) iters - mean;
      stdev += s * s;
    }
  stdev = sqrt (stdev / (RUN_COUNT - 1));

  char buf[128];
  snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");

  json_attr_object_begin (js, buf);

  json_attr_double (js, "duration", (double) cur);
  json_attr_double (js, "iterations", (double) iters);
  json_attr_double (js, "wall-sec", (double) td);
  json_attr_double (js, "mean", mean);
  json_attr_double (js, "stdev", stdev);
  json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
  json_attr_double (js, "min", (double) curs[1] / (double) iters);
  json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
  json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);

  json_attr_object_end (js);

  return 0;
}

static int
do_bench_1 (const char *name, test_t func, json_ctx_t *js)
{
  int rv = 0;

  rv += do_bench_2 (name, func, 0, js);
  rv += do_bench_2 (name, func, 1, js);

  return rv;
}

int
do_bench (void)
{
  int rv = 0;
  json_ctx_t json_ctx;

  json_init (&json_ctx, 2, stdout);
  json_attr_object_begin (&json_ctx, "pthread_locks");

#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)

  BENCH (mutex);
  BENCH (mutex_trylock);
  BENCH (rwlock_read);
  BENCH (rwlock_tryread);
  BENCH (rwlock_write);
  BENCH (rwlock_trywrite);
  BENCH (spin_lock);
  BENCH (spin_trylock);
  BENCH (sem_wait);
  BENCH (sem_trywait);
  BENCH (condvar);
  BENCH (consumer_producer);

  json_attr_object_end (&json_ctx);

  return rv;
}

#define TEST_FUNCTION do_bench ()

#include "../test-skeleton.c"