libgomp/ordered.c

   1 /* Copyright (C) 2005-2015 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Offloading and Multi Processing Library
   5    (libgomp).
   6
   7    Libgomp is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15    more details.
  16
  17    Under Section 7 of GPL version 3, you are granted additional
  18    permissions described in the GCC Runtime Library Exception, version
  19    3.1, as published by the Free Software Foundation.
  20
  21    You should have received a copy of the GNU General Public License and
  22    a copy of the GCC Runtime Library Exception along with this program;
  23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24    <http://www.gnu.org/licenses/>.  */
  25
   26 /* This file handles the ORDERED construct and its DOACROSS support.  */
  27
  28 #include "libgomp.h"
  29 #include <stdarg.h>
  30 #include <string.h>
  31 #include "doacross.h"
  32
  33
  34 /* This function is called when first allocating an iteration block.  That
  35    is, the thread is not currently on the queue.  The work-share lock must
  36    be held on entry.  */
  37
  38 void
  39 gomp_ordered_first (void)
  40 {
  41   struct gomp_thread *thr = gomp_thread ();
  42   struct gomp_team *team = thr->ts.team;
  43   struct gomp_work_share *ws = thr->ts.work_share;
  44   unsigned index;
  45
  46   /* Work share constructs can be orphaned.  */
  47   if (team == NULL || team->nthreads == 1)
  48     return;
  49
  50   index = ws->ordered_cur + ws->ordered_num_used;
  51   if (index >= team->nthreads)
  52     index -= team->nthreads;
  53   ws->ordered_team_ids[index] = thr->ts.team_id;
  54
  55   /* If this is the first and only thread in the queue, then there is
  56      no one to release us when we get to our ordered section.  Post to
  57      our own release queue now so that we won't block later.  */
  58   if (ws->ordered_num_used++ == 0)
  59     gomp_sem_post (team->ordered_release[thr->ts.team_id]);
  60 }
  61
  62 /* This function is called when completing the last iteration block.  That
  63    is, there are no more iterations to perform and so the thread should be
  64    removed from the queue entirely.  Because of the way ORDERED blocks are
  65    managed, it follows that we currently own access to the ORDERED block,
  66    and should now pass it on to the next thread.  The work-share lock must
  67    be held on entry.  */
  68
  69 void
  70 gomp_ordered_last (void)
  71 {
  72   struct gomp_thread *thr = gomp_thread ();
  73   struct gomp_team *team = thr->ts.team;
  74   struct gomp_work_share *ws = thr->ts.work_share;
  75   unsigned next_id;
  76
  77   /* Work share constructs can be orphaned.  */
  78   if (team == NULL || team->nthreads == 1)
  79     return;
  80
  81   /* We're no longer the owner.  */
  82   ws->ordered_owner = -1;
  83
  84   /* If we're not the last thread in the queue, then wake the next.  */
  85   if (--ws->ordered_num_used > 0)
  86     {
  87       unsigned next = ws->ordered_cur + 1;
  88       if (next == team->nthreads)
  89         next = 0;
  90       ws->ordered_cur = next;
  91
  92       next_id = ws->ordered_team_ids[next];
  93       gomp_sem_post (team->ordered_release[next_id]);
  94     }
  95 }
  96
  97
  98 /* This function is called when allocating a subsequent allocation block.
  99    That is, we're done with the current iteration block and we're allocating
 100    another.  This is the logical combination of a call to gomp_ordered_last
 101    followed by a call to gomp_ordered_first.  The work-share lock must be
 102    held on entry. */
 103
 104 void
 105 gomp_ordered_next (void)
 106 {
 107   struct gomp_thread *thr = gomp_thread ();
 108   struct gomp_team *team = thr->ts.team;
 109   struct gomp_work_share *ws = thr->ts.work_share;
 110   unsigned index, next_id;
 111
 112   /* Work share constructs can be orphaned.  */
 113   if (team == NULL || team->nthreads == 1)
 114     return;
 115
 116   /* We're no longer the owner.  */
 117   ws->ordered_owner = -1;
 118
 119   /* If there's only one thread in the queue, that must be us.  */
 120   if (ws->ordered_num_used == 1)
 121     {
 122       /* We have a similar situation as in gomp_ordered_first
 123          where we need to post to our own release semaphore.  */
 124       gomp_sem_post (team->ordered_release[thr->ts.team_id]);
 125       return;
 126     }
 127
 128   /* If the queue is entirely full, then we move ourself to the end of
 129      the queue merely by incrementing ordered_cur.  Only if it's not
 130      full do we have to write our id.  */
 131   if (ws->ordered_num_used < team->nthreads)
 132     {
 133       index = ws->ordered_cur + ws->ordered_num_used;
 134       if (index >= team->nthreads)
 135         index -= team->nthreads;
 136       ws->ordered_team_ids[index] = thr->ts.team_id;
 137     }
 138
 139   index = ws->ordered_cur + 1;
 140   if (index == team->nthreads)
 141     index = 0;
 142   ws->ordered_cur = index;
 143
 144   next_id = ws->ordered_team_ids[index];
 145   gomp_sem_post (team->ordered_release[next_id]);
 146 }
 147
 148
 149 /* This function is called when a statically scheduled loop is first
 150    being created.  */
 151
 152 void
 153 gomp_ordered_static_init (void)
 154 {
 155   struct gomp_thread *thr = gomp_thread ();
 156   struct gomp_team *team = thr->ts.team;
 157
 158   if (team == NULL || team->nthreads == 1)
 159     return;
 160
 161   gomp_sem_post (team->ordered_release[0]);
 162 }
 163
 164 /* This function is called when a statically scheduled loop is moving to
 165    the next allocation block.  Static schedules are not first come first
 166    served like the others, so we're to move to the numerically next thread,
 167    not the next thread on a list.  The work-share lock should *not* be held
 168    on entry.  */
 169
 170 void
 171 gomp_ordered_static_next (void)
 172 {
 173   struct gomp_thread *thr = gomp_thread ();
 174   struct gomp_team *team = thr->ts.team;
 175   struct gomp_work_share *ws = thr->ts.work_share;
 176   unsigned id = thr->ts.team_id;
 177
 178   if (team == NULL || team->nthreads == 1)
 179     return;
 180
 181   ws->ordered_owner = -1;
 182
 183   /* This thread currently owns the lock.  Increment the owner.  */
 184   if (++id == team->nthreads)
 185     id = 0;
 186   ws->ordered_team_ids[0] = id;
 187   gomp_sem_post (team->ordered_release[id]);
 188 }
 189
  190 /* This function is called when we need to assert that the thread owns the
  191    ordered section.  Due to the problem of posted-but-not-waited semaphores,
  192    this needs to happen before completing a loop iteration.
         Unless ws->ordered_owner already holds the caller's team id, the
         caller waits on its own entry of team->ordered_release and then
         records itself as the owner.  */
  193
  194 void
  195 gomp_ordered_sync (void)
  196 {
  197   struct gomp_thread *thr = gomp_thread ();
  198   struct gomp_team *team = thr->ts.team;
  199   struct gomp_work_share *ws = thr->ts.work_share;
  200
  201   /* Work share constructs can be orphaned.  But this clearly means that
  202      we are the only thread, and so we automatically own the section.  */
  203   if (team == NULL || team->nthreads == 1)
  204     return;
  205
  206   /* ??? I believe it to be safe to access this data without taking the
  207      ws->lock.  The only presumed race condition is with the previous
  208      thread on the queue incrementing ordered_cur such that it points
  209      to us, concurrently with our check below.  But our team_id is
  210      already present in the queue, and the other thread will always
  211      post to our release semaphore.  So the two cases are that we will
  212      either win the race and momentarily block on the semaphore, or lose
  213      the race and find the semaphore already unlocked and so not block.
  214      Either way we get correct results.
  215      However, there is an implicit flush on entry to an ordered region,
  216      so we do need to have a barrier here.  If we were taking a lock
  217      this could be MEMMODEL_RELEASE since the acquire would be covered
  218      by the lock.  */
  219
  220   __atomic_thread_fence (MEMMODEL_ACQ_REL);
        /* Already recorded as owner: nothing to wait for.  */
  221   if (ws->ordered_owner != thr->ts.team_id)
  222     {
  223       gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
  224       ws->ordered_owner = thr->ts.team_id;
  225     }
  226 }
 227
  228 /* This function is called by user code when encountering the start of an
  229    ORDERED block.  We must check to see if the current thread is at the
  230    head of the queue, and if not, block.
         All of the work is done by gomp_ordered_sync: when
         HAVE_ATTRIBUTE_ALIAS is defined this entry point is declared as an
         alias of it, otherwise it is a wrapper that calls it.  */
  231
  232 #ifdef HAVE_ATTRIBUTE_ALIAS
  233 extern void GOMP_ordered_start (void)
  234         __attribute__((alias ("gomp_ordered_sync")));
  235 #else
  236 void
  237 GOMP_ordered_start (void)
  238 {
  239   gomp_ordered_sync ();
  240 }
  241 #endif
 242
  243 /* This function is called by user code when encountering the end of an
  244    ORDERED block.  With the current ORDERED implementation there's nothing
  245    for us to do, so the body is intentionally empty.
  246
  247    However, the current implementation has a flaw in that it does not allow
  248    the next thread into the ORDERED section immediately after the current
  249    thread exits the ORDERED section in its last iteration.  The existence
  250    of this function allows the implementation to change.  */
  251
  252 void
  253 GOMP_ordered_end (void)
  254 {
  255 }
 256
  257 /* DOACROSS initialization.
         Builds the gomp_doacross_work_share that GOMP_doacross_post and
         GOMP_doacross_wait operate on and stores it in the current work
         share.  NCOUNTS is the number of elements of COUNTS that are
         examined.  CHUNK_SIZE is recorded in the new structure and, for
         schedules other than GFS_STATIC, also sizes the entry array.
         Returns without allocating or touching ws->doacross when there is
         no team, the team has a single thread, or some count is zero.
         NOTE(review): COUNTS presumably holds the iteration count of each
         loop dimension -- confirm against the callers.  */
  258
      /* Width of unsigned long in bits: the room available for packing one
         value per dimension into a single word.  */
  259 #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
  260
  261 void
  262 gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
  263 {
  264   struct gomp_thread *thr = gomp_thread ();
  265   struct gomp_team *team = thr->ts.team;
  266   struct gomp_work_share *ws = thr->ts.work_share;
  267   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  268   unsigned long ent, num_ents, elt_sz, shift_sz;
  269   struct gomp_doacross_work_share *doacross;
  270
  271   if (team == NULL || team->nthreads == 1)
  272     return;
  273
        /* Add up the bits needed per dimension.  num_bits is forced to
           MAX_COLLAPSED_BITS + 1 once the total no longer fits in one
           unsigned long, which also switches off further accounting.  */
  274   for (i = 0; i < ncounts; i++)
  275     {
  276       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
  277       if (counts[i] == 0)
  278         return;
  279
  280       if (num_bits <= MAX_COLLAPSED_BITS)
  281         {
                /* this_bits is the bit length of counts[i] - 1, with a
                   minimum of one bit when the count is 1.  */
  282           unsigned int this_bits;
  283           if (counts[i] == 1)
  284             this_bits = 1;
  285           else
  286             this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
  287                         - __builtin_clzl (counts[i] - 1);
  288           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
  289             {
  290               bits[i] = this_bits;
  291               num_bits += this_bits;
  292             }
  293           else
  294             num_bits = MAX_COLLAPSED_BITS + 1;
  295         }
  296     }
  297
        /* One array entry per thread for GFS_STATIC, otherwise one per
           CHUNK_SIZE-sized piece of counts[0] (rounded up).
           NOTE(review): the second form divides by CHUNK_SIZE, so it is
           presumably never zero for those schedules -- confirm.  */
  298   if (ws->sched == GFS_STATIC)
  299     num_ents = team->nthreads;
  300   else
  301     num_ents = (counts[0] - 1) / chunk_size + 1;
        /* Flattened form: one word per entry plus one shift count per
           dimension.  Otherwise: one word per dimension in each entry.  */
  302   if (num_bits <= MAX_COLLAPSED_BITS)
  303     {
  304       elt_sz = sizeof (unsigned long);
  305       shift_sz = ncounts * sizeof (unsigned int);
  306     }
  307   else
  308     {
  309       elt_sz = sizeof (unsigned long) * ncounts;
  310       shift_sz = 0;
  311     }
        /* Round the entry size up to a multiple of 64 bytes.  */
  312   elt_sz = (elt_sz + 63) & ~63UL;
  313
        /* The extra 63 bytes leave room to align the entry array below.  */
  314   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
  315                           + shift_sz);
  316   doacross->chunk_size = chunk_size;
  317   doacross->elt_sz = elt_sz;
  318   doacross->ncounts = ncounts;
  319   doacross->flattened = false;
        /* The entries start at the first 64-byte aligned address that lies
           at least shift_sz bytes past the end of the structure.  */
  320   doacross->array = (unsigned char *)
  321                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
  322                      & ~(uintptr_t) 63);
  323   if (num_bits <= MAX_COLLAPSED_BITS)
  324     {
  325       unsigned int shift_count = 0;
  326       doacross->flattened = true;
            /* The last dimension gets shift 0; every earlier dimension is
               shifted past the bits of all the dimensions after it.  */
  327       for (i = ncounts; i > 0; i--)
  328         {
  329           doacross->shift_counts[i - 1] = shift_count;
  330           shift_count += bits[i - 1];
  331         }
            /* Every entry starts out as 0.  */
  332       for (ent = 0; ent < num_ents; ent++)
  333         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
  334     }
  335   else
  336     for (ent = 0; ent < num_ents; ent++)
  337       memset (doacross->array + ent * elt_sz, '\0',
  338               sizeof (unsigned long) * ncounts);
        /* For GFS_STATIC with a zero CHUNK_SIZE, cache the quotient and
           remainder of counts[0] by the number of threads, plus
           t * (q + 1).  GOMP_doacross_wait maps values below that boundary
           to entries in runs of q + 1 and the remaining ones in runs
           of q.  */
  339   if (ws->sched == GFS_STATIC && chunk_size == 0)
  340     {
  341       unsigned long q = counts[0] / num_ents;
  342       unsigned long t = counts[0] % num_ents;
  343       doacross->boundary = t * (q + 1);
  344       doacross->q = q;
  345       doacross->t = t;
  346     }
  347   ws->doacross = doacross;
  348 }
 349
  350 /* DOACROSS POST operation.
         Records the values COUNTS[0 .. ncounts - 1] in one entry of the
         work share's doacross array.  The entry is selected by the caller's
         team id for GFS_STATIC and by COUNTS[0] / chunk_size otherwise.
         When the work share has no doacross structure, only a full memory
         barrier is issued.  */
  351
  352 void
  353 GOMP_doacross_post (long *counts)
  354 {
  355   struct gomp_thread *thr = gomp_thread ();
  356   struct gomp_work_share *ws = thr->ts.work_share;
  357   struct gomp_doacross_work_share *doacross = ws->doacross;
  358   unsigned long ent;
  359   unsigned int i;
  360
  361   if (__builtin_expect (doacross == NULL, 0))
  362     {
  363       __sync_synchronize ();
  364       return;
  365     }
  366
  367   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
  368     ent = thr->ts.team_id;
  369   else
  370     ent = counts[0] / doacross->chunk_size;
  371   unsigned long *array = (unsigned long *) (doacross->array
  372                                             + ent * doacross->elt_sz);
  373
  374   if (__builtin_expect (doacross->flattened, 1))
  375     {
            /* Pack all values into one word using the per-dimension shift
               counts.  The stored value is the packed word plus one;
               gomp_doacross_init leaves every entry at 0.  */
  376       unsigned long flattened
  377         = (unsigned long) counts[0] << doacross->shift_counts[0];
  378
  379       for (i = 1; i < doacross->ncounts; i++)
  380         flattened |= (unsigned long) counts[i]
  381                      << doacross->shift_counts[i];
  382       flattened++;
            /* Skip the store when the entry already holds this value; a
               release fence is issued in its place.  */
  383       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
  384         __atomic_thread_fence (MEMMODEL_RELEASE);
  385       else
  386         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
  387       return;
  388     }
  389
        /* Non-flattened form: one word per dimension, each holding the
           value plus one.  The words are visited from the last dimension
           to the first and only written when they differ.  */
  390   __atomic_thread_fence (MEMMODEL_ACQUIRE);
  391   for (i = doacross->ncounts; i-- > 0; )
  392     {
  393       if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
  394         __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
  395     }
  396 }
 397
  398 /* DOACROSS WAIT operation.
         FIRST is the value for the first dimension; the values for the
         remaining ncounts - 1 dimensions are read as variadic arguments of
         type long.  The function returns once the array entry selected
         from FIRST holds a posted value that is not behind the requested
         one.  When the work share has no doacross structure, only a full
         memory barrier is issued.  */
  399
  400 void
  401 GOMP_doacross_wait (long first, ...)
  402 {
  403   struct gomp_thread *thr = gomp_thread ();
  404   struct gomp_work_share *ws = thr->ts.work_share;
  405   struct gomp_doacross_work_share *doacross = ws->doacross;
  406   va_list ap;
  407   unsigned long ent;
  408   unsigned int i;
  409
  410   if (__builtin_expect (doacross == NULL, 0))
  411     {
  412       __sync_synchronize ();
  413       return;
  414     }
  415
        /* Select the entry to watch.  For GFS_STATIC it is derived from
           FIRST using the q, t and boundary values cached by
           gomp_doacross_init when ws->chunk_size is 0, and from the chunk
           number modulo the number of threads otherwise.  Other schedules
           use FIRST / chunk_size directly.  */
  416   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
  417     {
  418       if (ws->chunk_size == 0)
  419         {
  420           if (first < doacross->boundary)
  421             ent = first / (doacross->q + 1);
  422           else
  423             ent = (first - doacross->boundary) / doacross->q
  424                   + doacross->t;
  425         }
  426       else
  427         ent = first / ws->chunk_size % thr->ts.team->nthreads;
  428     }
  429   else
  430     ent = first / doacross->chunk_size;
  431   unsigned long *array = (unsigned long *) (doacross->array
  432                                             + ent * doacross->elt_sz);
  433
  434   if (__builtin_expect (doacross->flattened, 1))
  435     {
            /* Pack the requested values exactly as GOMP_doacross_post does,
               but without adding one.  */
  436       unsigned long flattened
  437         = (unsigned long) first << doacross->shift_counts[0];
  438       unsigned long cur;
  439
  440       va_start (ap, first);
  441       for (i = 1; i < doacross->ncounts; i++)
  442         flattened |= (unsigned long) va_arg (ap, long)
  443                      << doacross->shift_counts[i];
  444       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
            /* The entry stores packed value + 1, so a larger entry means
               the requested value has already been posted.  */
  445       if (flattened < cur)
  446         {
  447           __atomic_thread_fence (MEMMODEL_RELEASE);
  448           va_end (ap);
  449           return;
  450         }
            /* NOTE(review): doacross_spin is not defined in this file;
               presumably it waits until the entry exceeds FLATTENED --
               confirm in doacross.h.  */
  451       doacross_spin (array, flattened, cur);
  452       __atomic_thread_fence (MEMMODEL_RELEASE);
  453       va_end (ap);
  454       return;
  455     }
  456
        /* Non-flattened form: compare the requested values plus one with
           the stored words, dimension by dimension from the first.  A
           smaller requested word ends the wait, a larger one means "not
           yet", and equal words defer to the next dimension.  If all words
           are equal the wait ends as well.  The argument list is re-read
           on every retry.  */
  457   do
  458     {
  459       va_start (ap, first);
  460       for (i = 0; i < doacross->ncounts; i++)
  461         {
  462           unsigned long thisv
  463             = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
  464           unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
  465           if (thisv < cur)
  466             {
                    /* Mark success so the check after the loop ends the wait.  */
  467               i = doacross->ncounts;
  468               break;
  469             }
  470           if (thisv > cur)
  471             break;
  472         }
  473       va_end (ap);
  474       if (i == doacross->ncounts)
  475         break;
  476       cpu_relax ();
  477     }
  478   while (1);
  479   __sync_synchronize ();
  480 }
 481
 482 typedef unsigned long long gomp_ull;
 483
 484 void
 485 gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
 486 {
 487   struct gomp_thread *thr = gomp_thread ();
 488   struct gomp_team *team = thr->ts.team;
 489   struct gomp_work_share *ws = thr->ts.work_share;
 490   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
 491   unsigned long ent, num_ents, elt_sz, shift_sz;
 492   struct gomp_doacross_work_share *doacross;
 493
 494   if (team == NULL || team->nthreads == 1)
 495     return;
 496
 497   for (i = 0; i < ncounts; i++)
 498     {
 499       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
 500       if (counts[i] == 0)
 501         return;
 502
 503       if (num_bits <= MAX_COLLAPSED_BITS)
 504         {
 505           unsigned int this_bits;
 506           if (counts[i] == 1)
 507             this_bits = 1;
 508           else
 509             this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
 510                         - __builtin_clzll (counts[i] - 1);
 511           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
 512             {
 513               bits[i] = this_bits;
 514               num_bits += this_bits;
 515             }
 516           else
 517             num_bits = MAX_COLLAPSED_BITS + 1;
 518         }
 519     }
 520
 521   if (ws->sched == GFS_STATIC)
 522     num_ents = team->nthreads;
 523   else
 524     num_ents = (counts[0] - 1) / chunk_size + 1;
 525   if (num_bits <= MAX_COLLAPSED_BITS)
 526     {
 527       elt_sz = sizeof (unsigned long);
 528       shift_sz = ncounts * sizeof (unsigned int);
 529     }
 530   else
 531     {
 532       if (sizeof (gomp_ull) == sizeof (unsigned long))
 533         elt_sz = sizeof (gomp_ull) * ncounts;
 534       else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
 535         elt_sz = sizeof (unsigned long) * 2 * ncounts;
 536       else
 537         abort ();
 538       shift_sz = 0;
 539     }
 540   elt_sz = (elt_sz + 63) & ~63UL;
 541
 542   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
 543                           + shift_sz);
 544   doacross->chunk_size_ull = chunk_size;
 545   doacross->elt_sz = elt_sz;
 546   doacross->ncounts = ncounts;
 547   doacross->flattened = false;
 548   doacross->boundary = 0;
 549   doacross->array = (unsigned char *)
 550                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 551                      & ~(uintptr_t) 63);
 552   if (num_bits <= MAX_COLLAPSED_BITS)
 553     {
 554       unsigned int shift_count = 0;
 555       doacross->flattened = true;
 556       for (i = ncounts; i > 0; i--)
 557         {
 558           doacross->shift_counts[i - 1] = shift_count;
 559           shift_count += bits[i - 1];
 560         }
 561       for (ent = 0; ent < num_ents; ent++)
 562         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
 563     }
 564   else
 565     for (ent = 0; ent < num_ents; ent++)
 566       memset (doacross->array + ent * elt_sz, '\0',
 567               sizeof (unsigned long) * ncounts);
 568   if (ws->sched == GFS_STATIC && chunk_size == 0)
 569     {
 570       gomp_ull q = counts[0] / num_ents;
 571       gomp_ull t = counts[0] % num_ents;
 572       doacross->boundary_ull = t * (q + 1);
 573       doacross->q_ull = q;
 574       doacross->t = t;
 575     }
 576   ws->doacross = doacross;
 577 }
 578
  579 /* DOACROSS POST operation.
         Unsigned long long counterpart of GOMP_doacross_post: records the
         values COUNTS[0 .. ncounts - 1] in one entry of the work share's
         doacross array.  The entry is selected by the caller's team id for
         GFS_STATIC and by COUNTS[0] / chunk_size_ull otherwise.  When the
         work share has no doacross structure, only a full memory barrier is
         issued.  */
  580
  581 void
  582 GOMP_doacross_ull_post (gomp_ull *counts)
  583 {
  584   struct gomp_thread *thr = gomp_thread ();
  585   struct gomp_work_share *ws = thr->ts.work_share;
  586   struct gomp_doacross_work_share *doacross = ws->doacross;
  587   unsigned long ent;
  588   unsigned int i;
  589
  590   if (__builtin_expect (doacross == NULL, 0))
  591     {
  592       __sync_synchronize ();
  593       return;
  594     }
  595
  596   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
  597     ent = thr->ts.team_id;
  598   else
  599     ent = counts[0] / doacross->chunk_size_ull;
  600
  601   if (__builtin_expect (doacross->flattened, 1))
  602     {
            /* Pack all values into one word using the per-dimension shift
               counts.  The stored value is the packed word plus one.  */
  603       unsigned long *array = (unsigned long *) (doacross->array
  604                               + ent * doacross->elt_sz);
  605       gomp_ull flattened
  606         = counts[0] << doacross->shift_counts[0];
  607
  608       for (i = 1; i < doacross->ncounts; i++)
  609         flattened |= counts[i] << doacross->shift_counts[i];
  610       flattened++;
            /* Skip the store when the entry already holds this value; a
               release fence is issued in its place.  */
  611       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
  612         __atomic_thread_fence (MEMMODEL_RELEASE);
  613       else
  614         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
  615       return;
  616     }
  617
  618   __atomic_thread_fence (MEMMODEL_ACQUIRE);
  619   if (sizeof (gomp_ull) == sizeof (unsigned long))
  620     {
            /* One word per dimension, each holding the value plus one.  The
               words are visited from the last dimension to the first and
               only written when they differ.  */
  621       gomp_ull *array = (gomp_ull *) (doacross->array
  622                                       + ent * doacross->elt_sz);
  623
  624       for (i = doacross->ncounts; i-- > 0; )
  625         {
  626           if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
  627             __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
  628         }
  629     }
  630   else
  631     {
            /* Two words per dimension: array[2 * i] holds the upper half of
               the value plus one and array[2 * i + 1] the lower half.  The
               lower half is written before the upper half.  */
  632       unsigned long *array = (unsigned long *) (doacross->array
  633                                                 + ent * doacross->elt_sz);
  634
  635       for (i = doacross->ncounts; i-- > 0; )
  636         {
  637           gomp_ull cull = counts[i] + 1UL;
  638           unsigned long c = (unsigned long) cull;
  639           if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
  640             __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
  641           c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
  642           if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
  643             __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
  644         }
  645     }
  646 }
 647
  648 /* DOACROSS WAIT operation.
         Unsigned long long counterpart of GOMP_doacross_wait.  FIRST is the
         value for the first dimension; the values for the remaining
         ncounts - 1 dimensions are read as variadic arguments of type
         gomp_ull.  The function returns once the array entry selected from
         FIRST holds a posted value that is not behind the requested one.
         When the work share has no doacross structure, only a full memory
         barrier is issued.  */
  649
  650 void
  651 GOMP_doacross_ull_wait (gomp_ull first, ...)
  652 {
  653   struct gomp_thread *thr = gomp_thread ();
  654   struct gomp_work_share *ws = thr->ts.work_share;
  655   struct gomp_doacross_work_share *doacross = ws->doacross;
  656   va_list ap;
  657   unsigned long ent;
  658   unsigned int i;
  659
  660   if (__builtin_expect (doacross == NULL, 0))
  661     {
  662       __sync_synchronize ();
  663       return;
  664     }
  665
        /* Select the entry to watch.  For GFS_STATIC it is derived from
           FIRST using the q_ull, t and boundary_ull values cached by
           gomp_doacross_ull_init when ws->chunk_size_ull is 0, and from the
           chunk number modulo the number of threads otherwise.  Other
           schedules use FIRST / chunk_size_ull directly.  */
  666   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
  667     {
  668       if (ws->chunk_size_ull == 0)
  669         {
  670           if (first < doacross->boundary_ull)
  671             ent = first / (doacross->q_ull + 1);
  672           else
  673             ent = (first - doacross->boundary_ull) / doacross->q_ull
  674                   + doacross->t;
  675         }
  676       else
  677         ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
  678     }
  679   else
  680     ent = first / doacross->chunk_size_ull;
  681
  682   if (__builtin_expect (doacross->flattened, 1))
  683     {
            /* Pack the requested values exactly as GOMP_doacross_ull_post
               does, but without adding one.  */
  684       unsigned long *array = (unsigned long *) (doacross->array
  685                                                 + ent * doacross->elt_sz);
  686       gomp_ull flattened = first << doacross->shift_counts[0];
  687       unsigned long cur;
  688
  689       va_start (ap, first);
  690       for (i = 1; i < doacross->ncounts; i++)
  691         flattened |= va_arg (ap, gomp_ull)
  692                      << doacross->shift_counts[i];
  693       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
            /* The entry stores packed value + 1, so a larger entry means
               the requested value has already been posted.  */
  694       if (flattened < cur)
  695         {
  696           __atomic_thread_fence (MEMMODEL_RELEASE);
  697           va_end (ap);
  698           return;
  699         }
            /* NOTE(review): doacross_spin is not defined in this file;
               presumably it waits until the entry exceeds FLATTENED --
               confirm in doacross.h.  */
  700       doacross_spin (array, flattened, cur);
  701       __atomic_thread_fence (MEMMODEL_RELEASE);
  702       va_end (ap);
  703       return;
  704     }
  705
  706   if (sizeof (gomp_ull) == sizeof (unsigned long))
  707     {
            /* One word per dimension: compare the requested values plus one
               with the stored words from the first dimension on.  A smaller
               requested word ends the wait, a larger one means "not yet",
               and equal words defer to the next dimension.  If all words
               are equal the wait ends as well.  */
  708       gomp_ull *array = (gomp_ull *) (doacross->array
  709                                       + ent * doacross->elt_sz);
  710       do
  711         {
  712           va_start (ap, first);
  713           for (i = 0; i < doacross->ncounts; i++)
  714             {
  715               gomp_ull thisv
  716                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
  717               gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
  718               if (thisv < cur)
  719                 {
  720                   i = doacross->ncounts;
  721                   break;
  722                 }
  723               if (thisv > cur)
  724                 break;
  725             }
  726           va_end (ap);
  727           if (i == doacross->ncounts)
  728             break;
  729           cpu_relax ();
  730         }
  731       while (1);
  732     }
  733   else
  734     {
            /* Two words per dimension: the same comparison, done first on
               the upper half kept in array[2 * i] and, when those are
               equal, on the lower half kept in array[2 * i + 1].  */
  735       unsigned long *array = (unsigned long *) (doacross->array
  736                                                 + ent * doacross->elt_sz);
  737       do
  738         {
  739           va_start (ap, first);
  740           for (i = 0; i < doacross->ncounts; i++)
  741             {
  742               gomp_ull thisv
  743                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
  744               unsigned long t
  745                 = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
  746               unsigned long cur
  747                 = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
  748               if (t < cur)
  749                 {
  750                   i = doacross->ncounts;
  751                   break;
  752                 }
  753               if (t > cur)
  754                 break;
                    /* Upper halves are equal: compare the lower halves.  */
  755               t = thisv;
  756               cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
  757               if (t < cur)
  758                 {
  759                   i = doacross->ncounts;
  760                   break;
  761                 }
  762               if (t > cur)
  763                 break;
  764             }
  765           va_end (ap);
  766           if (i == doacross->ncounts)
  767             break;
  768           cpu_relax ();
  769         }
  770       while (1);
  771     }
  772   __sync_synchronize ();
  773 }