riscv: align stack before calling _dl_init [BZ #28703]
[glibc.git] / benchtests / bench-pthread-locks.c
blob5274246a4bd28f09e9b1b5bbb1d300566abec20f
1 /* Measure various lock acquisition times for empty critical sections.
2 Copyright (C) 2020-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #define TEST_MAIN
20 #define TEST_NAME "pthread-locks"
22 #include <stdio.h>
23 #include <string.h>
24 #include <limits.h>
25 #include <stdlib.h>
26 #include <pthread.h>
27 #include <semaphore.h>
28 #include <stdatomic.h>
29 #include <sys/time.h>
30 #include <math.h>
31 #include "bench-timing.h"
32 #include "json-lib.h"
34 /* The point of this benchmark is to measure the overhead of an empty
35 critical section or a small critical section. This is never going
36 to be indicative of real application performance. Instead we are
37 trying to benchmark the effects of the compiler and the runtime
38 coupled with a particular set of hardware atomic operations.
39 The numbers from this benchmark should be taken with a massive gain
40 of salt and viewed through the eyes of expert reviewers. */
42 static pthread_mutex_t m;
43 static pthread_rwlock_t rw;
44 static pthread_cond_t cv;
45 static pthread_cond_t consumer_c, producer_c;
46 static int cv_done;
47 static pthread_spinlock_t sp;
48 static sem_t sem;
50 typedef timing_t (*test_t)(long, int);
52 #define START_ITERS 1000
54 #define FILLER_GOES_HERE \
55 if (filler) \
56 do_filler ();
58 /* Everyone loves a good fibonacci series. This isn't quite one of
59 them because we need larger values in fewer steps, in a way that
60 won't be optimized away. We're looking to approximately double the
61 total time each test iteration takes, so as to not swamp the useful
62 timings. */
64 #pragma GCC push_options
65 #pragma GCC optimize(1)
67 static int __attribute__((noinline))
68 fibonacci (int i)
70 asm("");
71 if (i > 2)
72 return fibonacci (i-1) + fibonacci (i-2);
73 return 10+i;
76 static void
77 do_filler (void)
79 static char buf1[512], buf2[512];
80 int f = fibonacci (5);
81 memcpy (buf1, buf2, f);
84 #pragma GCC pop_options
86 static timing_t
87 test_mutex (long iters, int filler)
89 timing_t start, stop, cur;
91 pthread_mutex_init (&m, NULL);
93 TIMING_NOW (start);
94 for (long j = iters; j >= 0; --j)
96 pthread_mutex_lock (&m);
97 FILLER_GOES_HERE;
98 pthread_mutex_unlock (&m);
100 TIMING_NOW (stop);
101 TIMING_DIFF (cur, start, stop);
103 return cur;
106 static timing_t
107 test_mutex_trylock (long iters, int filler)
109 timing_t start, stop, cur;
111 pthread_mutex_init (&m, NULL);
112 pthread_mutex_lock (&m);
114 TIMING_NOW (start);
115 for (long j = iters; j >= 0; --j)
117 pthread_mutex_trylock (&m);
118 FILLER_GOES_HERE;
120 TIMING_NOW (stop);
121 TIMING_DIFF (cur, start, stop);
123 pthread_mutex_unlock (&m);
124 return cur;
127 static timing_t
128 test_rwlock_read (long iters, int filler)
130 timing_t start, stop, cur;
132 pthread_rwlock_init (&rw, NULL);
134 TIMING_NOW (start);
135 for (long j = iters; j >= 0; --j)
137 pthread_rwlock_rdlock (&rw);
138 FILLER_GOES_HERE;
139 pthread_rwlock_unlock (&rw);
141 TIMING_NOW (stop);
142 TIMING_DIFF (cur, start, stop);
144 return cur;
147 static timing_t
148 test_rwlock_tryread (long iters, int filler)
150 timing_t start, stop, cur;
152 pthread_rwlock_init (&rw, NULL);
153 pthread_rwlock_wrlock (&rw);
155 TIMING_NOW (start);
156 for (long j = iters; j >= 0; --j)
158 pthread_rwlock_tryrdlock (&rw);
159 FILLER_GOES_HERE;
161 TIMING_NOW (stop);
162 TIMING_DIFF (cur, start, stop);
164 pthread_rwlock_unlock (&rw);
165 return cur;
168 static timing_t
169 test_rwlock_write (long iters, int filler)
171 timing_t start, stop, cur;
173 pthread_rwlock_init (&rw, NULL);
175 TIMING_NOW (start);
176 for (long j = iters; j >= 0; --j)
178 pthread_rwlock_wrlock (&rw);
179 FILLER_GOES_HERE;
180 pthread_rwlock_unlock (&rw);
182 TIMING_NOW (stop);
183 TIMING_DIFF (cur, start, stop);
185 return cur;
188 static timing_t
189 test_rwlock_trywrite (long iters, int filler)
191 timing_t start, stop, cur;
193 pthread_rwlock_init (&rw, NULL);
194 pthread_rwlock_rdlock (&rw);
196 TIMING_NOW (start);
197 for (long j = iters; j >= 0; --j)
199 pthread_rwlock_trywrlock (&rw);
200 FILLER_GOES_HERE;
202 TIMING_NOW (stop);
203 TIMING_DIFF (cur, start, stop);
205 pthread_rwlock_unlock (&rw);
206 return cur;
209 static timing_t
210 test_spin_lock (long iters, int filler)
212 timing_t start, stop, cur;
214 pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
216 TIMING_NOW (start);
217 for (long j = iters; j >= 0; --j)
219 pthread_spin_lock (&sp);
220 FILLER_GOES_HERE;
221 pthread_spin_unlock (&sp);
223 TIMING_NOW (stop);
224 TIMING_DIFF (cur, start, stop);
226 return cur;
229 static timing_t
230 test_spin_trylock (long iters, int filler)
232 timing_t start, stop, cur;
234 pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
235 pthread_spin_lock (&sp);
237 TIMING_NOW (start);
238 for (long j = iters; j >= 0; --j)
240 pthread_spin_trylock (&sp);
241 FILLER_GOES_HERE;
243 TIMING_NOW (stop);
244 TIMING_DIFF (cur, start, stop);
246 pthread_spin_unlock (&sp);
247 return cur;
250 static timing_t
251 test_sem_wait (long iters, int filler)
253 timing_t start, stop, cur;
255 sem_init (&sem, 0, 1);
257 TIMING_NOW (start);
258 for (long j = iters; j >= 0; --j)
260 sem_post (&sem);
261 FILLER_GOES_HERE;
262 sem_wait (&sem);
264 TIMING_NOW (stop);
265 TIMING_DIFF (cur, start, stop);
267 return cur;
270 static timing_t
271 test_sem_trywait (long iters, int filler)
273 timing_t start, stop, cur;
275 sem_init (&sem, 0, 0);
277 TIMING_NOW (start);
278 for (long j = iters; j >= 0; --j)
280 sem_trywait (&sem);
281 FILLER_GOES_HERE;
283 TIMING_NOW (stop);
284 TIMING_DIFF (cur, start, stop);
286 return cur;
289 static void *
290 test_condvar_helper (void *v)
292 /* This is wasteful, but the alternative is to add the overhead of a
293 mutex lock/unlock to the overall iteration (both threads) and we
294 don't want that. Ideally, this thread would run on an
295 independent processing core anyway. The ONLY goal here is to
296 minimize the time the other thread spends waiting for us. */
297 while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
298 pthread_cond_signal (&cv);
300 return NULL;
303 static timing_t
304 test_condvar (long iters, int filler)
306 timing_t start, stop, cur;
307 pthread_t helper_id;
309 pthread_mutex_init (&m, NULL);
310 pthread_cond_init (&cv, NULL);
311 pthread_mutex_lock (&m);
313 __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
314 pthread_create (&helper_id, NULL, test_condvar_helper, &iters);
316 TIMING_NOW (start);
317 for (long j = iters; j >= 0; --j)
319 pthread_cond_wait (&cv, &m);
320 FILLER_GOES_HERE;
322 TIMING_NOW (stop);
323 TIMING_DIFF (cur, start, stop);
325 pthread_mutex_unlock (&m);
326 __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);
328 pthread_join (helper_id, NULL);
329 return cur;
332 /* How many items are "queued" in our pretend queue. */
333 static int queued = 0;
335 typedef struct Producer_Params {
336 long iters;
337 int filler;
338 } Producer_Params;
340 /* We only benchmark the consumer thread, but both threads are doing
341 essentially the same thing, and never run in parallel due to the
342 locks. Thus, even if they run on separate processing cores, we
343 count the time for both threads. */
344 static void *
345 test_producer_thread (void *v)
347 Producer_Params *p = (Producer_Params *) v;
348 long iters = p->iters;
349 int filler = p->filler;
350 long j;
352 for (j = iters; j >= 0; --j)
354 /* Aquire lock on the queue. */
355 pthread_mutex_lock (&m);
356 /* if something's already there, wait. */
357 while (queued > 0)
358 pthread_cond_wait (&consumer_c, &m);
360 /* Put something on the queue */
361 FILLER_GOES_HERE;
362 ++ queued;
363 pthread_cond_signal (&producer_c);
365 /* Give the other thread a chance to run. */
366 pthread_mutex_unlock (&m);
369 return NULL;
372 static timing_t
373 test_consumer_producer (long iters, int filler)
375 timing_t start, stop, cur;
376 pthread_t helper_id;
377 Producer_Params p;
379 p.iters = iters;
380 p.filler = filler;
382 pthread_mutex_init (&m, NULL);
383 pthread_cond_init (&cv, NULL);
385 pthread_create (&helper_id, NULL, test_producer_thread, &p);
387 TIMING_NOW (start);
389 for (long j = iters; j >= 0; --j)
391 /* Aquire lock on the queue. */
392 pthread_mutex_lock (&m);
393 /* Wait for something to be on the queue. */
394 while (queued == 0)
395 pthread_cond_wait (&producer_c, &m);
397 /* Take if off. */
398 FILLER_GOES_HERE;
399 -- queued;
400 pthread_cond_signal (&consumer_c);
402 /* Give the other thread a chance to run. */
403 pthread_mutex_unlock (&m);
406 TIMING_NOW (stop);
407 TIMING_DIFF (cur, start, stop);
410 pthread_join (helper_id, NULL);
411 return cur;
414 /* Number of runs we use for computing mean and standard deviation.
415 We actually do two additional runs and discard the outliers. */
416 #define RUN_COUNT 10
418 static int
419 do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
421 timing_t cur;
422 struct timeval ts, te;
423 double tsd, ted, td;
424 long iters, iters_limit;
425 timing_t curs[RUN_COUNT + 2];
426 int i, j;
427 double mean, stdev;
429 iters = START_ITERS;
430 iters_limit = LONG_MAX / 100;
432 while (1) {
433 gettimeofday (&ts, NULL);
434 cur = func(iters, filler);
435 gettimeofday (&te, NULL);
437 /* We want a test to take at least 0.01 seconds, and try
438 increasingly larger iteration counts until it does. This
439 allows for approximately constant-time tests regardless of
440 hardware speed, without the overhead of checking the time
441 inside the test loop itself. We stop at a million iterations
442 as that should be precise enough. Once we determine a suitable
443 iteration count, we run the test multiple times to calculate
444 mean and standard deviation. */
446 /* Note that this also primes the CPU cache and triggers faster
447 MHz, we hope. */
448 tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
449 ted = te.tv_sec + te.tv_usec / 1000000.0;
450 td = ted - tsd;
451 if (td >= 0.01
452 || iters >= iters_limit
453 || iters >= 1000000)
454 break;
456 iters *= 10;
459 curs[0] = cur;
460 for (i = 1; i < RUN_COUNT + 2; i ++)
461 curs[i] = func(iters, filler);
463 /* We sort the results so we can discard the fastest and slowest
464 times as outliers. In theory we should keep the fastest time,
465 but IMHO this is more fair. A simple bubble sort suffices. */
467 for (i = 0; i < RUN_COUNT + 1; i ++)
468 for (j = i + 1; j < RUN_COUNT + 2; j ++)
469 if (curs[i] > curs[j])
471 timing_t temp = curs[i];
472 curs[i] = curs[j];
473 curs[j] = temp;
476 /* Now calculate mean and standard deviation, skipping the outliers. */
477 mean = 0.0;
478 for (i = 1; i<RUN_COUNT + 1; i ++)
479 mean += (double) curs[i] / (double) iters;
480 mean /= RUN_COUNT;
482 stdev = 0.0;
483 for (i = 1; i < RUN_COUNT + 1; i ++)
485 double s = (double) curs[i] / (double) iters - mean;
486 stdev += s * s;
488 stdev = sqrt (stdev / (RUN_COUNT - 1));
490 char buf[128];
491 snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");
493 json_attr_object_begin (js, buf);
495 json_attr_double (js, "duration", (double) cur);
496 json_attr_double (js, "iterations", (double) iters);
497 json_attr_double (js, "wall-sec", (double) td);
498 json_attr_double (js, "mean", mean);
499 json_attr_double (js, "stdev", stdev);
500 json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
501 json_attr_double (js, "min", (double) curs[1] / (double) iters);
502 json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
503 json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);
505 json_attr_object_end (js);
507 return 0;
510 static int
511 do_bench_1 (const char *name, test_t func, json_ctx_t *js)
513 int rv = 0;
515 rv += do_bench_2 (name, func, 0, js);
516 rv += do_bench_2 (name, func, 1, js);
518 return rv;
522 do_bench (void)
524 int rv = 0;
525 json_ctx_t json_ctx;
527 json_init (&json_ctx, 2, stdout);
528 json_attr_object_begin (&json_ctx, "pthread_locks");
530 #define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)
532 BENCH (mutex);
533 BENCH (mutex_trylock);
534 BENCH (rwlock_read);
535 BENCH (rwlock_tryread);
536 BENCH (rwlock_write);
537 BENCH (rwlock_trywrite);
538 BENCH (spin_lock);
539 BENCH (spin_trylock);
540 BENCH (sem_wait);
541 BENCH (sem_trywait);
542 BENCH (condvar);
543 BENCH (consumer_producer);
545 json_attr_object_end (&json_ctx);
547 return rv;
551 #define TEST_FUNCTION do_bench ()
553 #include "../test-skeleton.c"