libgomp/ordered.c

   1 /* Copyright (C) 2005-2023 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Offloading and Multi Processing Library
   5    (libgomp).
   6
   7    Libgomp is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15    more details.
  16
  17    Under Section 7 of GPL version 3, you are granted additional
  18    permissions described in the GCC Runtime Library Exception, version
  19    3.1, as published by the Free Software Foundation.
  20
  21    You should have received a copy of the GNU General Public License and
  22    a copy of the GCC Runtime Library Exception along with this program;
  23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24    <http://www.gnu.org/licenses/>.  */
  25
  26 /* This file handles the ORDERED construct.  */
  27
  28 #include "libgomp.h"
  29 #include <stdarg.h>
  30 #include <string.h>
  31 #include "doacross.h"
  32
  33
  34 /* This function is called when first allocating an iteration block.  That
  35    is, the thread is not currently on the queue.  The work-share lock must
  36    be held on entry.  */
  37
  38 void
  39 gomp_ordered_first (void)
  40 {
  41   struct gomp_thread *thr = gomp_thread ();
  42   struct gomp_team *team = thr->ts.team;
  43   struct gomp_work_share *ws = thr->ts.work_share;
  44   unsigned index;
  45
  46   /* Work share constructs can be orphaned.  */
  47   if (team == NULL || team->nthreads == 1)
  48     return;
  49
  50   index = ws->ordered_cur + ws->ordered_num_used;
  51   if (index >= team->nthreads)
  52     index -= team->nthreads;
  53   ws->ordered_team_ids[index] = thr->ts.team_id;
  54
  55   /* If this is the first and only thread in the queue, then there is
  56      no one to release us when we get to our ordered section.  Post to
  57      our own release queue now so that we won't block later.  */
  58   if (ws->ordered_num_used++ == 0)
  59     gomp_sem_post (team->ordered_release[thr->ts.team_id]);
  60 }
  61
  62 /* This function is called when completing the last iteration block.  That
  63    is, there are no more iterations to perform and so the thread should be
  64    removed from the queue entirely.  Because of the way ORDERED blocks are
  65    managed, it follows that we currently own access to the ORDERED block,
  66    and should now pass it on to the next thread.  The work-share lock must
  67    be held on entry.  */
  68
  69 void
  70 gomp_ordered_last (void)
  71 {
  72   struct gomp_thread *thr = gomp_thread ();
  73   struct gomp_team *team = thr->ts.team;
  74   struct gomp_work_share *ws = thr->ts.work_share;
  75   unsigned next_id;
  76
  77   /* Work share constructs can be orphaned.  */
  78   if (team == NULL || team->nthreads == 1)
  79     return;
  80
  81   /* We're no longer the owner.  */
  82   ws->ordered_owner = -1;
  83
  84   /* If we're not the last thread in the queue, then wake the next.  */
  85   if (--ws->ordered_num_used > 0)
  86     {
  87       unsigned next = ws->ordered_cur + 1;
  88       if (next == team->nthreads)
  89         next = 0;
  90       ws->ordered_cur = next;
  91
  92       next_id = ws->ordered_team_ids[next];
  93       gomp_sem_post (team->ordered_release[next_id]);
  94     }
  95 }
  96
  97
  98 /* This function is called when allocating a subsequent allocation block.
  99    That is, we're done with the current iteration block and we're allocating
 100    another.  This is the logical combination of a call to gomp_ordered_last
 101    followed by a call to gomp_ordered_first.  The work-share lock must be
 102    held on entry. */
 103
 104 void
 105 gomp_ordered_next (void)
 106 {
 107   struct gomp_thread *thr = gomp_thread ();
 108   struct gomp_team *team = thr->ts.team;
 109   struct gomp_work_share *ws = thr->ts.work_share;
 110   unsigned index, next_id;
 111
 112   /* Work share constructs can be orphaned.  */
 113   if (team == NULL || team->nthreads == 1)
 114     return;
 115
 116   /* We're no longer the owner.  */
 117   ws->ordered_owner = -1;
 118
 119   /* If there's only one thread in the queue, that must be us.  */
 120   if (ws->ordered_num_used == 1)
 121     {
 122       /* We have a similar situation as in gomp_ordered_first
 123          where we need to post to our own release semaphore.  */
 124       gomp_sem_post (team->ordered_release[thr->ts.team_id]);
 125       return;
 126     }
 127
 128   /* If the queue is entirely full, then we move ourself to the end of
 129      the queue merely by incrementing ordered_cur.  Only if it's not
 130      full do we have to write our id.  */
 131   if (ws->ordered_num_used < team->nthreads)
 132     {
 133       index = ws->ordered_cur + ws->ordered_num_used;
 134       if (index >= team->nthreads)
 135         index -= team->nthreads;
 136       ws->ordered_team_ids[index] = thr->ts.team_id;
 137     }
 138
 139   index = ws->ordered_cur + 1;
 140   if (index == team->nthreads)
 141     index = 0;
 142   ws->ordered_cur = index;
 143
 144   next_id = ws->ordered_team_ids[index];
 145   gomp_sem_post (team->ordered_release[next_id]);
 146 }
 147
 148
 149 /* This function is called when a statically scheduled loop is first
 150    being created.  */
 151
 152 void
 153 gomp_ordered_static_init (void)
 154 {
 155   struct gomp_thread *thr = gomp_thread ();
 156   struct gomp_team *team = thr->ts.team;
 157
 158   if (team == NULL || team->nthreads == 1)
 159     return;
 160
 161   gomp_sem_post (team->ordered_release[0]);
 162 }
 163
 164 /* This function is called when a statically scheduled loop is moving to
 165    the next allocation block.  Static schedules are not first come first
 166    served like the others, so we're to move to the numerically next thread,
 167    not the next thread on a list.  The work-share lock should *not* be held
 168    on entry.  */
 169
 170 void
 171 gomp_ordered_static_next (void)
 172 {
 173   struct gomp_thread *thr = gomp_thread ();
 174   struct gomp_team *team = thr->ts.team;
 175   struct gomp_work_share *ws = thr->ts.work_share;
 176   unsigned id = thr->ts.team_id;
 177
 178   if (team == NULL || team->nthreads == 1)
 179     return;
 180
 181   ws->ordered_owner = -1;
 182
 183   /* This thread currently owns the lock.  Increment the owner.  */
 184   if (++id == team->nthreads)
 185     id = 0;
 186   ws->ordered_team_ids[0] = id;
 187   gomp_sem_post (team->ordered_release[id]);
 188 }
 189
 190 /* This function is called when we need to assert that the thread owns the
 191    ordered section.  Due to the problem of posted-but-not-waited semaphores,
 192    this needs to happen before completing a loop iteration.  */
 193
 194 void
 195 gomp_ordered_sync (void)
 196 {
 197   struct gomp_thread *thr = gomp_thread ();
 198   struct gomp_team *team = thr->ts.team;
 199   struct gomp_work_share *ws = thr->ts.work_share;
 200
 201   /* Work share constructs can be orphaned.  But this clearly means that
 202      we are the only thread, and so we automatically own the section.  */
 203   if (team == NULL || team->nthreads == 1)
 204     return;
 205
 206   /* ??? I believe it to be safe to access this data without taking the
 207      ws->lock.  The only presumed race condition is with the previous
 208      thread on the queue incrementing ordered_cur such that it points
 209      to us, concurrently with our check below.  But our team_id is
 210      already present in the queue, and the other thread will always
 211      post to our release semaphore.  So the two cases are that we will
 212      either win the race an momentarily block on the semaphore, or lose
 213      the race and find the semaphore already unlocked and so not block.
 214      Either way we get correct results.
 215      However, there is an implicit flush on entry to an ordered region,
 216      so we do need to have a barrier here.  If we were taking a lock
 217      this could be MEMMODEL_RELEASE since the acquire would be covered
 218      by the lock.  */
 219
 220   __atomic_thread_fence (MEMMODEL_ACQ_REL);
 221   if (ws->ordered_owner != thr->ts.team_id)
 222     {
 223       gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
 224       ws->ordered_owner = thr->ts.team_id;
 225     }
 226 }
 227
 228 /* This function is called by user code when encountering the start of an
 229    ORDERED block.  We must check to see if the current thread is at the
 230    head of the queue, and if not, block.  */
 231
 232 #ifdef HAVE_ATTRIBUTE_ALIAS
 233 extern void GOMP_ordered_start (void)
 234         __attribute__((alias ("gomp_ordered_sync")));
 235 #else
 236 void
 237 GOMP_ordered_start (void)
 238 {
 239   gomp_ordered_sync ();
 240 }
 241 #endif
 242
 243 /* This function is called by user code when encountering the end of an
 244    ORDERED block.  With the current ORDERED implementation there's nothing
 245    for us to do.
 246
 247    However, the current implementation has a flaw in that it does not allow
 248    the next thread into the ORDERED section immediately after the current
 249    thread exits the ORDERED section in its last iteration.  The existence
 250    of this function allows the implementation to change.  */
 251
 252 void
 253 GOMP_ordered_end (void)
 254 {
 255 }
 256
 257 /* DOACROSS initialization.  */
 258
 259 #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
 260
 261 void
 262 gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
 263                     size_t extra)
 264 {
 265   struct gomp_thread *thr = gomp_thread ();
 266   struct gomp_team *team = thr->ts.team;
 267   struct gomp_work_share *ws = thr->ts.work_share;
 268   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
 269   unsigned long ent, num_ents, elt_sz, shift_sz;
 270   struct gomp_doacross_work_share *doacross;
 271
 272   if (team == NULL || team->nthreads == 1)
 273     {
 274     empty:
 275       if (!extra)
 276         ws->doacross = NULL;
 277       else
 278         {
 279           doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
 280           doacross->extra = (void *) (doacross + 1);
 281           ws->doacross = doacross;
 282         }
 283       return;
 284     }
 285
 286   for (i = 0; i < ncounts; i++)
 287     {
 288       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
 289       if (counts[i] == 0)
 290         goto empty;
 291
 292       if (num_bits <= MAX_COLLAPSED_BITS)
 293         {
 294           unsigned int this_bits;
 295           if (counts[i] == 1)
 296             this_bits = 1;
 297           else
 298             this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
 299                         - __builtin_clzl (counts[i] - 1);
 300           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
 301             {
 302               bits[i] = this_bits;
 303               num_bits += this_bits;
 304             }
 305           else
 306             num_bits = MAX_COLLAPSED_BITS + 1;
 307         }
 308     }
 309
 310   if (ws->sched == GFS_STATIC)
 311     num_ents = team->nthreads;
 312   else if (ws->sched == GFS_GUIDED)
 313     num_ents = counts[0];
 314   else
 315     num_ents = (counts[0] - 1) / chunk_size + 1;
 316   if (num_bits <= MAX_COLLAPSED_BITS)
 317     {
 318       elt_sz = sizeof (unsigned long);
 319       shift_sz = ncounts * sizeof (unsigned int);
 320     }
 321   else
 322     {
 323       elt_sz = sizeof (unsigned long) * ncounts;
 324       shift_sz = 0;
 325     }
 326   elt_sz = (elt_sz + 63) & ~63UL;
 327
 328   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
 329                           + shift_sz + extra);
 330   doacross->chunk_size = chunk_size;
 331   doacross->elt_sz = elt_sz;
 332   doacross->ncounts = ncounts;
 333   doacross->flattened = false;
 334   doacross->array = (unsigned char *)
 335                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 336                      & ~(uintptr_t) 63);
 337   if (extra)
 338     {
 339       doacross->extra = doacross->array + num_ents * elt_sz;
 340       memset (doacross->extra, '\0', extra);
 341     }
 342   else
 343     doacross->extra = NULL;
 344   if (num_bits <= MAX_COLLAPSED_BITS)
 345     {
 346       unsigned int shift_count = 0;
 347       doacross->flattened = true;
 348       for (i = ncounts; i > 0; i--)
 349         {
 350           doacross->shift_counts[i - 1] = shift_count;
 351           shift_count += bits[i - 1];
 352         }
 353       for (ent = 0; ent < num_ents; ent++)
 354         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
 355     }
 356   else
 357     for (ent = 0; ent < num_ents; ent++)
 358       memset (doacross->array + ent * elt_sz, '\0',
 359               sizeof (unsigned long) * ncounts);
 360   if (ws->sched == GFS_STATIC && chunk_size == 0)
 361     {
 362       unsigned long q = counts[0] / num_ents;
 363       unsigned long t = counts[0] % num_ents;
 364       doacross->boundary = t * (q + 1);
 365       doacross->q = q;
 366       doacross->t = t;
 367     }
 368   ws->doacross = doacross;
 369 }
 370
 371 /* DOACROSS POST operation.  */
 372
 373 void
 374 GOMP_doacross_post (long *counts)
 375 {
 376   struct gomp_thread *thr = gomp_thread ();
 377   struct gomp_work_share *ws = thr->ts.work_share;
 378   struct gomp_doacross_work_share *doacross = ws->doacross;
 379   unsigned long ent;
 380   unsigned int i;
 381
 382   if (__builtin_expect (doacross == NULL, 0)
 383       || __builtin_expect (doacross->array == NULL, 0))
 384     {
 385       __sync_synchronize ();
 386       return;
 387     }
 388
 389   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 390     ent = thr->ts.team_id;
 391   else if (ws->sched == GFS_GUIDED)
 392     ent = counts[0];
 393   else
 394     ent = counts[0] / doacross->chunk_size;
 395   unsigned long *array = (unsigned long *) (doacross->array
 396                                             + ent * doacross->elt_sz);
 397
 398   if (__builtin_expect (doacross->flattened, 1))
 399     {
 400       unsigned long flattened
 401         = (unsigned long) counts[0] << doacross->shift_counts[0];
 402
 403       for (i = 1; i < doacross->ncounts; i++)
 404         flattened |= (unsigned long) counts[i]
 405                      << doacross->shift_counts[i];
 406       flattened++;
 407       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
 408         __atomic_thread_fence (MEMMODEL_RELEASE);
 409       else
 410         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
 411       return;
 412     }
 413
 414   __atomic_thread_fence (MEMMODEL_ACQUIRE);
 415   for (i = doacross->ncounts; i-- > 0; )
 416     {
 417       if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
 418         __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
 419     }
 420 }
 421
 422 /* DOACROSS WAIT operation.  */
 423
 424 void
 425 GOMP_doacross_wait (long first, ...)
 426 {
 427   struct gomp_thread *thr = gomp_thread ();
 428   struct gomp_work_share *ws = thr->ts.work_share;
 429   struct gomp_doacross_work_share *doacross = ws->doacross;
 430   va_list ap;
 431   unsigned long ent;
 432   unsigned int i;
 433
 434   if (__builtin_expect (doacross == NULL, 0)
 435       || __builtin_expect (doacross->array == NULL, 0))
 436     {
 437       __sync_synchronize ();
 438       return;
 439     }
 440
 441   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 442     {
 443       if (ws->chunk_size == 0)
 444         {
 445           if (first < doacross->boundary)
 446             ent = first / (doacross->q + 1);
 447           else
 448             ent = (first - doacross->boundary) / doacross->q
 449                   + doacross->t;
 450         }
 451       else
 452         ent = first / ws->chunk_size % thr->ts.team->nthreads;
 453     }
 454   else if (ws->sched == GFS_GUIDED)
 455     ent = first;
 456   else
 457     ent = first / doacross->chunk_size;
 458   unsigned long *array = (unsigned long *) (doacross->array
 459                                             + ent * doacross->elt_sz);
 460
 461   if (__builtin_expect (doacross->flattened, 1))
 462     {
 463       unsigned long flattened
 464         = (unsigned long) first << doacross->shift_counts[0];
 465       unsigned long cur;
 466
 467       va_start (ap, first);
 468       for (i = 1; i < doacross->ncounts; i++)
 469         flattened |= (unsigned long) va_arg (ap, long)
 470                      << doacross->shift_counts[i];
 471       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
 472       if (flattened < cur)
 473         {
 474           __atomic_thread_fence (MEMMODEL_RELEASE);
 475           va_end (ap);
 476           return;
 477         }
 478       doacross_spin (array, flattened, cur);
 479       __atomic_thread_fence (MEMMODEL_RELEASE);
 480       va_end (ap);
 481       return;
 482     }
 483
 484   do
 485     {
 486       va_start (ap, first);
 487       for (i = 0; i < doacross->ncounts; i++)
 488         {
 489           unsigned long thisv
 490             = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
 491           unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
 492           if (thisv < cur)
 493             {
 494               i = doacross->ncounts;
 495               break;
 496             }
 497           if (thisv > cur)
 498             break;
 499         }
 500       va_end (ap);
 501       if (i == doacross->ncounts)
 502         break;
 503       cpu_relax ();
 504     }
 505   while (1);
 506   __sync_synchronize ();
 507 }
 508
 509 typedef unsigned long long gomp_ull;
 510
 511 void
 512 gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
 513                         gomp_ull chunk_size, size_t extra)
 514 {
 515   struct gomp_thread *thr = gomp_thread ();
 516   struct gomp_team *team = thr->ts.team;
 517   struct gomp_work_share *ws = thr->ts.work_share;
 518   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
 519   unsigned long ent, num_ents, elt_sz, shift_sz;
 520   struct gomp_doacross_work_share *doacross;
 521
 522   if (team == NULL || team->nthreads == 1)
 523     {
 524     empty:
 525       if (!extra)
 526         ws->doacross = NULL;
 527       else
 528         {
 529           doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
 530           doacross->extra = (void *) (doacross + 1);
 531           ws->doacross = doacross;
 532         }
 533       return;
 534     }
 535
 536   for (i = 0; i < ncounts; i++)
 537     {
 538       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
 539       if (counts[i] == 0)
 540         goto empty;
 541
 542       if (num_bits <= MAX_COLLAPSED_BITS)
 543         {
 544           unsigned int this_bits;
 545           if (counts[i] == 1)
 546             this_bits = 1;
 547           else
 548             this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
 549                         - __builtin_clzll (counts[i] - 1);
 550           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
 551             {
 552               bits[i] = this_bits;
 553               num_bits += this_bits;
 554             }
 555           else
 556             num_bits = MAX_COLLAPSED_BITS + 1;
 557         }
 558     }
 559
 560   if (ws->sched == GFS_STATIC)
 561     num_ents = team->nthreads;
 562   else if (ws->sched == GFS_GUIDED)
 563     num_ents = counts[0];
 564   else
 565     num_ents = (counts[0] - 1) / chunk_size + 1;
 566   if (num_bits <= MAX_COLLAPSED_BITS)
 567     {
 568       elt_sz = sizeof (unsigned long);
 569       shift_sz = ncounts * sizeof (unsigned int);
 570     }
 571   else
 572     {
 573       if (sizeof (gomp_ull) == sizeof (unsigned long))
 574         elt_sz = sizeof (gomp_ull) * ncounts;
 575       else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
 576         elt_sz = sizeof (unsigned long) * 2 * ncounts;
 577       else
 578         abort ();
 579       shift_sz = 0;
 580     }
 581   elt_sz = (elt_sz + 63) & ~63UL;
 582
 583   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
 584                           + shift_sz);
 585   doacross->chunk_size_ull = chunk_size;
 586   doacross->elt_sz = elt_sz;
 587   doacross->ncounts = ncounts;
 588   doacross->flattened = false;
 589   doacross->boundary = 0;
 590   doacross->array = (unsigned char *)
 591                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 592                      & ~(uintptr_t) 63);
 593   if (extra)
 594     {
 595       doacross->extra = doacross->array + num_ents * elt_sz;
 596       memset (doacross->extra, '\0', extra);
 597     }
 598   else
 599     doacross->extra = NULL;
 600   if (num_bits <= MAX_COLLAPSED_BITS)
 601     {
 602       unsigned int shift_count = 0;
 603       doacross->flattened = true;
 604       for (i = ncounts; i > 0; i--)
 605         {
 606           doacross->shift_counts[i - 1] = shift_count;
 607           shift_count += bits[i - 1];
 608         }
 609       for (ent = 0; ent < num_ents; ent++)
 610         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
 611     }
 612   else
 613     for (ent = 0; ent < num_ents; ent++)
 614       memset (doacross->array + ent * elt_sz, '\0',
 615               sizeof (unsigned long) * ncounts);
 616   if (ws->sched == GFS_STATIC && chunk_size == 0)
 617     {
 618       gomp_ull q = counts[0] / num_ents;
 619       gomp_ull t = counts[0] % num_ents;
 620       doacross->boundary_ull = t * (q + 1);
 621       doacross->q_ull = q;
 622       doacross->t = t;
 623     }
 624   ws->doacross = doacross;
 625 }
 626
 627 /* DOACROSS POST operation.  */
 628
 629 void
 630 GOMP_doacross_ull_post (gomp_ull *counts)
 631 {
 632   struct gomp_thread *thr = gomp_thread ();
 633   struct gomp_work_share *ws = thr->ts.work_share;
 634   struct gomp_doacross_work_share *doacross = ws->doacross;
 635   unsigned long ent;
 636   unsigned int i;
 637
 638   if (__builtin_expect (doacross == NULL, 0)
 639       || __builtin_expect (doacross->array == NULL, 0))
 640     {
 641       __sync_synchronize ();
 642       return;
 643     }
 644
 645   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 646     ent = thr->ts.team_id;
 647   else if (ws->sched == GFS_GUIDED)
 648     ent = counts[0];
 649   else
 650     ent = counts[0] / doacross->chunk_size_ull;
 651
 652   if (__builtin_expect (doacross->flattened, 1))
 653     {
 654       unsigned long *array = (unsigned long *) (doacross->array
 655                               + ent * doacross->elt_sz);
 656       gomp_ull flattened
 657         = counts[0] << doacross->shift_counts[0];
 658
 659       for (i = 1; i < doacross->ncounts; i++)
 660         flattened |= counts[i] << doacross->shift_counts[i];
 661       flattened++;
 662       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
 663         __atomic_thread_fence (MEMMODEL_RELEASE);
 664       else
 665         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
 666       return;
 667     }
 668
 669   __atomic_thread_fence (MEMMODEL_ACQUIRE);
 670   if (sizeof (gomp_ull) == sizeof (unsigned long))
 671     {
 672       gomp_ull *array = (gomp_ull *) (doacross->array
 673                                       + ent * doacross->elt_sz);
 674
 675       for (i = doacross->ncounts; i-- > 0; )
 676         {
 677           if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
 678             __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
 679         }
 680     }
 681   else
 682     {
 683       unsigned long *array = (unsigned long *) (doacross->array
 684                                                 + ent * doacross->elt_sz);
 685
 686       for (i = doacross->ncounts; i-- > 0; )
 687         {
 688           gomp_ull cull = counts[i] + 1UL;
 689           unsigned long c = (unsigned long) cull;
 690           if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
 691             __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
 692           c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
 693           if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
 694             __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
 695         }
 696     }
 697 }
 698
 699 /* DOACROSS WAIT operation.  */
 700
 701 void
 702 GOMP_doacross_ull_wait (gomp_ull first, ...)
 703 {
 704   struct gomp_thread *thr = gomp_thread ();
 705   struct gomp_work_share *ws = thr->ts.work_share;
 706   struct gomp_doacross_work_share *doacross = ws->doacross;
 707   va_list ap;
 708   unsigned long ent;
 709   unsigned int i;
 710
 711   if (__builtin_expect (doacross == NULL, 0)
 712       || __builtin_expect (doacross->array == NULL, 0))
 713     {
 714       __sync_synchronize ();
 715       return;
 716     }
 717
 718   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 719     {
 720       if (ws->chunk_size_ull == 0)
 721         {
 722           if (first < doacross->boundary_ull)
 723             ent = first / (doacross->q_ull + 1);
 724           else
 725             ent = (first - doacross->boundary_ull) / doacross->q_ull
 726                   + doacross->t;
 727         }
 728       else
 729         ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
 730     }
 731   else if (ws->sched == GFS_GUIDED)
 732     ent = first;
 733   else
 734     ent = first / doacross->chunk_size_ull;
 735
 736   if (__builtin_expect (doacross->flattened, 1))
 737     {
 738       unsigned long *array = (unsigned long *) (doacross->array
 739                                                 + ent * doacross->elt_sz);
 740       gomp_ull flattened = first << doacross->shift_counts[0];
 741       unsigned long cur;
 742
 743       va_start (ap, first);
 744       for (i = 1; i < doacross->ncounts; i++)
 745         flattened |= va_arg (ap, gomp_ull)
 746                      << doacross->shift_counts[i];
 747       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
 748       if (flattened < cur)
 749         {
 750           __atomic_thread_fence (MEMMODEL_RELEASE);
 751           va_end (ap);
 752           return;
 753         }
 754       doacross_spin (array, flattened, cur);
 755       __atomic_thread_fence (MEMMODEL_RELEASE);
 756       va_end (ap);
 757       return;
 758     }
 759
 760   if (sizeof (gomp_ull) == sizeof (unsigned long))
 761     {
 762       gomp_ull *array = (gomp_ull *) (doacross->array
 763                                       + ent * doacross->elt_sz);
 764       do
 765         {
 766           va_start (ap, first);
 767           for (i = 0; i < doacross->ncounts; i++)
 768             {
 769               gomp_ull thisv
 770                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
 771               gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
 772               if (thisv < cur)
 773                 {
 774                   i = doacross->ncounts;
 775                   break;
 776                 }
 777               if (thisv > cur)
 778                 break;
 779             }
 780           va_end (ap);
 781           if (i == doacross->ncounts)
 782             break;
 783           cpu_relax ();
 784         }
 785       while (1);
 786     }
 787   else
 788     {
 789       unsigned long *array = (unsigned long *) (doacross->array
 790                                                 + ent * doacross->elt_sz);
 791       do
 792         {
 793           va_start (ap, first);
 794           for (i = 0; i < doacross->ncounts; i++)
 795             {
 796               gomp_ull thisv
 797                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
 798               unsigned long t
 799                 = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
 800               unsigned long cur
 801                 = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
 802               if (t < cur)
 803                 {
 804                   i = doacross->ncounts;
 805                   break;
 806                 }
 807               if (t > cur)
 808                 break;
 809               t = thisv;
 810               cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
 811               if (t < cur)
 812                 {
 813                   i = doacross->ncounts;
 814                   break;
 815                 }
 816               if (t > cur)
 817                 break;
 818             }
 819           va_end (ap);
 820           if (i == doacross->ncounts)
 821             break;
 822           cpu_relax ();
 823         }
 824       while (1);
 825     }
 826   __sync_synchronize ();
 827 }