nptl/pthread_barrier_wait.c

   1 /* Copyright (C) 2003-2021 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <errno.h>
  20 #include <sysdep.h>
  21 #include <futex-internal.h>
  22 #include <pthreadP.h>
  23 #include <shlib-compat.h>
  24
  25
  26 /* Wait on the barrier.
  27
  28    In each round, we wait for a fixed number of threads to enter the barrier
  29    (COUNT).  Once that has happened, exactly these threads are allowed to
  30    leave the barrier.  Note that POSIX does not require that only COUNT
  31    threads can attempt to block using the barrier concurrently.
  32
  33    We count the number of threads that have entered (IN).  Each thread
  34    increments IN when entering, thus getting a position in the sequence of
  35    threads that are or have been waiting (starting with 1, so the position
  36    is the number of threads that have entered so far including the current
  37    thread).
  38    CURRENT_ROUND designates the most recent thread whose round has been
  39    detected as complete.  When a thread detects that enough threads have
  40    entered to make a round complete, it finishes this round by effectively
  41    adding COUNT to CURRENT_ROUND atomically.  Threads that believe that their
  42    round is not complete yet wait until CURRENT_ROUND is not smaller than
  43    their position anymore.
  44
  45    A barrier can be destroyed as soon as no threads are blocked on the
  46    barrier.  This is already the case if just one thread from the last round
  47    has stopped waiting and returned to the caller; the assumption is that
  48    all threads from the round are unblocked atomically, even though they may
  49    return at different times from the respective calls to
  50    pthread_barrier_wait).  Thus, a valid call to pthread_barrier_destroy can
  51    be concurrent with other threads still figuring out that their round has
  52    been completed.  Therefore, threads need to confirm that they have left
  53    the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait
  54    until OUT equals IN.
  55
  56    To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
  57    32b-only atomics, we additionally reset the barrier when IN reaches
  58    a threshold to avoid overflow.  We assume that the total number of threads
  59    is less than UINT_MAX/2, and set the threshold accordingly so that we can
  60    use a simple atomic_fetch_add on IN instead of a CAS when entering.  The
  61    threshold is always set to the end of a round, so all threads that have
  62    entered are either pre-reset threads or post-reset threads (i.e., have a
  63    position larger than the threshold).
  64    Pre-reset threads just run the algorithm explained above.  Post-reset
  65    threads wait until IN is reset to a pre-threshold value.
  66    When the last pre-reset thread leaves the barrier (i.e., OUT equals the
  67    threshold), it resets the barrier to its initial state.  Other (post-reset)
  68    threads wait for the reset to have finished by waiting until IN is less
  69    than the threshold and then restart by trying to enter the barrier again.
  70
  71    We reuse the reset mechanism in pthread_barrier_destroy to get notified
  72    when all threads have left the barrier: We trigger an artificial reset and
  73    wait for the last pre-reset thread to finish reset, thus notifying the
  74    thread that is about to destroy the barrier.
  75
  76    Blocking using futexes is straightforward: pre-reset threads wait for
  77    completion of their round using CURRENT_ROUND as futex word, and post-reset
  78    threads and pthread_barrier_destroy use IN as futex word.
  79
  80    Further notes:
  81    * It is not simple to let some of the post-reset threads help with the
  82      reset because of the ABA issues that arise; therefore, we simply make
  83      the last thread to leave responsible for the reset.
  84    * POSIX leaves it unspecified whether a signal handler running in a thread
  85      that has been unblocked (because its round is complete) can stall all
  86      other threads and prevent them from returning from the barrier.  In this
  87      implementation, other threads will return.  However,
  88      pthread_barrier_destroy will of course wait for the signal handler thread
  89      to confirm that it left the barrier.
  90
  91    TODO We should add spinning with back-off.  Once we do that, we could also
  92    try to avoid the futex_wake syscall when a round is detected as finished.
  93    If we do not spin, it is quite likely that at least some other threads will
  94    have called futex_wait already.  */
  95 int
  96 ___pthread_barrier_wait (pthread_barrier_t *barrier)
  97 {
  98   struct pthread_barrier *bar = (struct pthread_barrier *) barrier;
  99
 100   /* How many threads entered so far, including ourself.  */
 101   unsigned int i;
 102
 103  reset_restart:
 104   /* Try to enter the barrier.  We need acquire MO to (1) ensure that if we
 105      observe that our round can be completed (see below for our attempt to do
 106      so), all pre-barrier-entry effects of all threads in our round happen
 107      before us completing the round, and (2) to make our use of the barrier
 108      happen after a potential reset.  We need release MO to make sure that our
 109      pre-barrier-entry effects happen before threads in this round leaving the
 110      barrier.  */
 111   i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
 112   /* These loads are after the fetch_add so that we're less likely to first
 113      pull in the cache line as shared.  */
 114   unsigned int count = bar->count;
 115   /* This is the number of threads that can enter before we need to reset.
 116      Always at the end of a round.  */
 117   unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
 118                                    - BARRIER_IN_THRESHOLD % count;
 119
 120   if (i > max_in_before_reset)
 121     {
 122       /* We're in a reset round.  Just wait for a reset to finish; do not
 123          help finishing previous rounds because this could happen
 124          concurrently with a reset.  */
 125       while (i > max_in_before_reset)
 126         {
 127           futex_wait_simple (&bar->in, i, bar->shared);
 128           /* Relaxed MO is fine here because we just need an indication for
 129              when we should retry to enter (which will use acquire MO, see
 130              above).  */
 131           i = atomic_load_relaxed (&bar->in);
 132         }
 133       goto reset_restart;
 134     }
 135
 136   /* Look at the current round.  At this point, we are just interested in
 137      whether we can complete rounds, based on the information we obtained
 138      through our acquire-MO load of IN.  Nonetheless, if we notice that
 139      our round has been completed using this load, we use the acquire-MO
 140      fence below to make sure that all pre-barrier-entry effects of all
 141      threads in our round happen before us leaving the barrier.  Therefore,
 142      relaxed MO is sufficient.  */
 143   unsigned cr = atomic_load_relaxed (&bar->current_round);
 144
 145   /* Try to finish previous rounds and/or the current round.  We simply
 146      consider just our position here and do not try to do the work of threads
 147      that entered more recently.  */
 148   while (cr + count <= i)
 149     {
 150       /* Calculate the new current round based on how many threads entered.
 151          NEWCR must be larger than CR because CR+COUNT ends a round.  */
 152       unsigned int newcr = i - i % count;
 153       /* Try to complete previous and/or the current round.  We need release
 154          MO to propagate the happens-before that we observed through reading
 155          with acquire MO from IN to other threads.  If the CAS fails, it
 156          is like the relaxed-MO load of CURRENT_ROUND above.  */
 157       if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
 158                                                 newcr))
 159         {
 160           /* Update CR with the modification we just did.  */
 161           cr = newcr;
 162           /* Wake threads belonging to the rounds we just finished.  We may
 163              wake more threads than necessary if more than COUNT threads try
 164              to block concurrently on the barrier, but this is not a typical
 165              use of barriers.
 166              Note that we can still access SHARED because we haven't yet
 167              confirmed to have left the barrier.  */
 168           futex_wake (&bar->current_round, INT_MAX, bar->shared);
 169           /* We did as much as we could based on our position.  If we advanced
 170              the current round to a round sufficient for us, do not wait for
 171              that to happen and skip the acquire fence (we already
 172              synchronize-with all other threads in our round through the
 173              initial acquire MO fetch_add of IN.  */
 174           if (i <= cr)
 175             goto ready_to_leave;
 176           else
 177             break;
 178         }
 179     }
 180
 181   /* Wait until the current round is more recent than the round we are in.  */
 182   while (i > cr)
 183     {
 184       /* Wait for the current round to finish.  */
 185       futex_wait_simple (&bar->current_round, cr, bar->shared);
 186       /* See the fence below.  */
 187       cr = atomic_load_relaxed (&bar->current_round);
 188     }
 189
 190   /* Our round finished.  Use the acquire MO fence to synchronize-with the
 191      thread that finished the round, either through the initial load of
 192      CURRENT_ROUND above or a failed CAS in the loop above.  */
 193   atomic_thread_fence_acquire ();
 194
 195   /* Now signal that we left.  */
 196   unsigned int o;
 197  ready_to_leave:
 198   /* We need release MO here so that our use of the barrier happens before
 199      reset or memory reuse after pthread_barrier_destroy.  */
 200   o = atomic_fetch_add_release (&bar->out, 1) + 1;
 201   if (o == max_in_before_reset)
 202     {
 203       /* Perform a reset if we are the last pre-reset thread leaving.   All
 204          other threads accessing the barrier are post-reset threads and are
 205          incrementing or spinning on IN.  Thus, resetting IN as the last step
 206          of reset ensures that the reset is not concurrent with actual use of
 207          the barrier.  We need the acquire MO fence so that the reset happens
 208          after use of the barrier by all earlier pre-reset threads.  */
 209       atomic_thread_fence_acquire ();
 210       atomic_store_relaxed (&bar->current_round, 0);
 211       atomic_store_relaxed (&bar->out, 0);
 212       /* When destroying the barrier, we wait for a reset to happen.  Thus,
 213          we must load SHARED now so that this happens before the barrier is
 214          destroyed.  */
 215       int shared = bar->shared;
 216       atomic_store_release (&bar->in, 0);
 217       futex_wake (&bar->in, INT_MAX, shared);
 218
 219     }
 220
 221   /* Return a special value for exactly one thread per round.  */
 222   return i % count == 0 ?  PTHREAD_BARRIER_SERIAL_THREAD : 0;
 223 }
 224 versioned_symbol (libc, ___pthread_barrier_wait, pthread_barrier_wait,
 225                   GLIBC_2_34);
 226 libc_hidden_ver (___pthread_barrier_wait, __pthread_barrier_wait)
 227 #ifndef SHARED
 228 strong_alias (___pthread_barrier_wait, __pthread_barrier_wait)
 229 #endif
 230
 231 #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_2, GLIBC_2_34)
 232 compat_symbol (libpthread, ___pthread_barrier_wait, pthread_barrier_wait,
 233                GLIBC_2_2);
 234 #endif