libgomp/ordered.c

   1 /* Copyright (C) 2005-2017 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Offloading and Multi Processing Library
   5    (libgomp).
   6
   7    Libgomp is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15    more details.
  16
  17    Under Section 7 of GPL version 3, you are granted additional
  18    permissions described in the GCC Runtime Library Exception, version
  19    3.1, as published by the Free Software Foundation.
  20
  21    You should have received a copy of the GNU General Public License and
  22    a copy of the GCC Runtime Library Exception along with this program;
  23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24    <http://www.gnu.org/licenses/>.  */
  25
  26 /* This file handles the ORDERED construct.  */
  27
  28 #include "libgomp.h"
  29 #include <stdarg.h>
  30 #include <string.h>
  31 #include "doacross.h"
  32
  33
  34 /* This function is called when first allocating an iteration block.  That
  35    is, the thread is not currently on the queue.  The work-share lock must
  36    be held on entry.  */
  37
  38 void
  39 gomp_ordered_first (void)
  40 {
  41   struct gomp_thread *thr = gomp_thread ();
  42   struct gomp_team *team = thr->ts.team;
  43   struct gomp_work_share *ws = thr->ts.work_share;
  44   unsigned index;
  45
  46   /* Work share constructs can be orphaned.  */
  47   if (team == NULL || team->nthreads == 1)
  48     return;
  49
  50   index = ws->ordered_cur + ws->ordered_num_used;
  51   if (index >= team->nthreads)
  52     index -= team->nthreads;
  53   ws->ordered_team_ids[index] = thr->ts.team_id;
  54
  55   /* If this is the first and only thread in the queue, then there is
  56      no one to release us when we get to our ordered section.  Post to
  57      our own release queue now so that we won't block later.  */
  58   if (ws->ordered_num_used++ == 0)
  59     gomp_sem_post (team->ordered_release[thr->ts.team_id]);
  60 }
  61
  62 /* This function is called when completing the last iteration block.  That
  63    is, there are no more iterations to perform and so the thread should be
  64    removed from the queue entirely.  Because of the way ORDERED blocks are
  65    managed, it follows that we currently own access to the ORDERED block,
  66    and should now pass it on to the next thread.  The work-share lock must
  67    be held on entry.  */
  68
  69 void
  70 gomp_ordered_last (void)
  71 {
  72   struct gomp_thread *thr = gomp_thread ();
  73   struct gomp_team *team = thr->ts.team;
  74   struct gomp_work_share *ws = thr->ts.work_share;
  75   unsigned next_id;
  76
  77   /* Work share constructs can be orphaned.  */
  78   if (team == NULL || team->nthreads == 1)
  79     return;
  80
  81   /* We're no longer the owner.  */
  82   ws->ordered_owner = -1;
  83
  84   /* If we're not the last thread in the queue, then wake the next.  */
  85   if (--ws->ordered_num_used > 0)
  86     {
  87       unsigned next = ws->ordered_cur + 1;
  88       if (next == team->nthreads)
  89         next = 0;
  90       ws->ordered_cur = next;
  91
  92       next_id = ws->ordered_team_ids[next];
  93       gomp_sem_post (team->ordered_release[next_id]);
  94     }
  95 }
  96
  97
  98 /* This function is called when allocating a subsequent allocation block.
  99    That is, we're done with the current iteration block and we're allocating
 100    another.  This is the logical combination of a call to gomp_ordered_last
 101    followed by a call to gomp_ordered_first.  The work-share lock must be
 102    held on entry. */
 103
 104 void
 105 gomp_ordered_next (void)
 106 {
 107   struct gomp_thread *thr = gomp_thread ();
 108   struct gomp_team *team = thr->ts.team;
 109   struct gomp_work_share *ws = thr->ts.work_share;
 110   unsigned index, next_id;
 111
 112   /* Work share constructs can be orphaned.  */
 113   if (team == NULL || team->nthreads == 1)
 114     return;
 115
 116   /* We're no longer the owner.  */
 117   ws->ordered_owner = -1;
 118
 119   /* If there's only one thread in the queue, that must be us.  */
 120   if (ws->ordered_num_used == 1)
 121     {
 122       /* We have a similar situation as in gomp_ordered_first
 123          where we need to post to our own release semaphore.  */
 124       gomp_sem_post (team->ordered_release[thr->ts.team_id]);
 125       return;
 126     }
 127
 128   /* If the queue is entirely full, then we move ourself to the end of
 129      the queue merely by incrementing ordered_cur.  Only if it's not
 130      full do we have to write our id.  */
 131   if (ws->ordered_num_used < team->nthreads)
 132     {
 133       index = ws->ordered_cur + ws->ordered_num_used;
 134       if (index >= team->nthreads)
 135         index -= team->nthreads;
 136       ws->ordered_team_ids[index] = thr->ts.team_id;
 137     }
 138
 139   index = ws->ordered_cur + 1;
 140   if (index == team->nthreads)
 141     index = 0;
 142   ws->ordered_cur = index;
 143
 144   next_id = ws->ordered_team_ids[index];
 145   gomp_sem_post (team->ordered_release[next_id]);
 146 }
 147
 148
 149 /* This function is called when a statically scheduled loop is first
 150    being created.  */
 151
 152 void
 153 gomp_ordered_static_init (void)
 154 {
 155   struct gomp_thread *thr = gomp_thread ();
 156   struct gomp_team *team = thr->ts.team;
 157
 158   if (team == NULL || team->nthreads == 1)
 159     return;
 160
 161   gomp_sem_post (team->ordered_release[0]);
 162 }
 163
 164 /* This function is called when a statically scheduled loop is moving to
 165    the next allocation block.  Static schedules are not first come first
 166    served like the others, so we're to move to the numerically next thread,
 167    not the next thread on a list.  The work-share lock should *not* be held
 168    on entry.  */
 169
 170 void
 171 gomp_ordered_static_next (void)
 172 {
 173   struct gomp_thread *thr = gomp_thread ();
 174   struct gomp_team *team = thr->ts.team;
 175   struct gomp_work_share *ws = thr->ts.work_share;
 176   unsigned id = thr->ts.team_id;
 177
 178   if (team == NULL || team->nthreads == 1)
 179     return;
 180
 181   ws->ordered_owner = -1;
 182
 183   /* This thread currently owns the lock.  Increment the owner.  */
 184   if (++id == team->nthreads)
 185     id = 0;
 186   ws->ordered_team_ids[0] = id;
 187   gomp_sem_post (team->ordered_release[id]);
 188 }
 189
 190 /* This function is called when we need to assert that the thread owns the
 191    ordered section.  Due to the problem of posted-but-not-waited semaphores,
 192    this needs to happen before completing a loop iteration.  */
 193
 194 void
 195 gomp_ordered_sync (void)
 196 {
 197   struct gomp_thread *thr = gomp_thread ();
 198   struct gomp_team *team = thr->ts.team;
 199   struct gomp_work_share *ws = thr->ts.work_share;
 200
 201   /* Work share constructs can be orphaned.  But this clearly means that
 202      we are the only thread, and so we automatically own the section.  */
 203   if (team == NULL || team->nthreads == 1)
 204     return;
 205
 206   /* ??? I believe it to be safe to access this data without taking the
 207      ws->lock.  The only presumed race condition is with the previous
 208      thread on the queue incrementing ordered_cur such that it points
 209      to us, concurrently with our check below.  But our team_id is
 210      already present in the queue, and the other thread will always
 211      post to our release semaphore.  So the two cases are that we will
 212      either win the race an momentarily block on the semaphore, or lose
 213      the race and find the semaphore already unlocked and so not block.
 214      Either way we get correct results.
 215      However, there is an implicit flush on entry to an ordered region,
 216      so we do need to have a barrier here.  If we were taking a lock
 217      this could be MEMMODEL_RELEASE since the acquire would be coverd
 218      by the lock.  */
 219
 220   __atomic_thread_fence (MEMMODEL_ACQ_REL);
 221   if (ws->ordered_owner != thr->ts.team_id)
 222     {
 223       gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
 224       ws->ordered_owner = thr->ts.team_id;
 225     }
 226 }
 227
 228 /* This function is called by user code when encountering the start of an
 229    ORDERED block.  We must check to see if the current thread is at the
 230    head of the queue, and if not, block.  */
 231
 232 #ifdef HAVE_ATTRIBUTE_ALIAS
 233 extern void GOMP_ordered_start (void)
 234         __attribute__((alias ("gomp_ordered_sync")));
 235 #else
 236 void
 237 GOMP_ordered_start (void)
 238 {
 239   gomp_ordered_sync ();
 240 }
 241 #endif
 242
 243 /* This function is called by user code when encountering the end of an
 244    ORDERED block.  With the current ORDERED implementation there's nothing
 245    for us to do.
 246
 247    However, the current implementation has a flaw in that it does not allow
 248    the next thread into the ORDERED section immediately after the current
 249    thread exits the ORDERED section in its last iteration.  The existance
 250    of this function allows the implementation to change.  */
 251
 252 void
 253 GOMP_ordered_end (void)
 254 {
 255 }
 256
 257 /* DOACROSS initialization.  */
 258
 259 #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
 260
 261 void
 262 gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
 263 {
 264   struct gomp_thread *thr = gomp_thread ();
 265   struct gomp_team *team = thr->ts.team;
 266   struct gomp_work_share *ws = thr->ts.work_share;
 267   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
 268   unsigned long ent, num_ents, elt_sz, shift_sz;
 269   struct gomp_doacross_work_share *doacross;
 270
 271   if (team == NULL || team->nthreads == 1)
 272     return;
 273
 274   for (i = 0; i < ncounts; i++)
 275     {
 276       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
 277       if (counts[i] == 0)
 278         return;
 279
 280       if (num_bits <= MAX_COLLAPSED_BITS)
 281         {
 282           unsigned int this_bits;
 283           if (counts[i] == 1)
 284             this_bits = 1;
 285           else
 286             this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
 287                         - __builtin_clzl (counts[i] - 1);
 288           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
 289             {
 290               bits[i] = this_bits;
 291               num_bits += this_bits;
 292             }
 293           else
 294             num_bits = MAX_COLLAPSED_BITS + 1;
 295         }
 296     }
 297
 298   if (ws->sched == GFS_STATIC)
 299     num_ents = team->nthreads;
 300   else if (ws->sched == GFS_GUIDED)
 301     num_ents = counts[0];
 302   else
 303     num_ents = (counts[0] - 1) / chunk_size + 1;
 304   if (num_bits <= MAX_COLLAPSED_BITS)
 305     {
 306       elt_sz = sizeof (unsigned long);
 307       shift_sz = ncounts * sizeof (unsigned int);
 308     }
 309   else
 310     {
 311       elt_sz = sizeof (unsigned long) * ncounts;
 312       shift_sz = 0;
 313     }
 314   elt_sz = (elt_sz + 63) & ~63UL;
 315
 316   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
 317                           + shift_sz);
 318   doacross->chunk_size = chunk_size;
 319   doacross->elt_sz = elt_sz;
 320   doacross->ncounts = ncounts;
 321   doacross->flattened = false;
 322   doacross->array = (unsigned char *)
 323                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 324                      & ~(uintptr_t) 63);
 325   if (num_bits <= MAX_COLLAPSED_BITS)
 326     {
 327       unsigned int shift_count = 0;
 328       doacross->flattened = true;
 329       for (i = ncounts; i > 0; i--)
 330         {
 331           doacross->shift_counts[i - 1] = shift_count;
 332           shift_count += bits[i - 1];
 333         }
 334       for (ent = 0; ent < num_ents; ent++)
 335         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
 336     }
 337   else
 338     for (ent = 0; ent < num_ents; ent++)
 339       memset (doacross->array + ent * elt_sz, '\0',
 340               sizeof (unsigned long) * ncounts);
 341   if (ws->sched == GFS_STATIC && chunk_size == 0)
 342     {
 343       unsigned long q = counts[0] / num_ents;
 344       unsigned long t = counts[0] % num_ents;
 345       doacross->boundary = t * (q + 1);
 346       doacross->q = q;
 347       doacross->t = t;
 348     }
 349   ws->doacross = doacross;
 350 }
 351
 352 /* DOACROSS POST operation.  */
 353
 354 void
 355 GOMP_doacross_post (long *counts)
 356 {
 357   struct gomp_thread *thr = gomp_thread ();
 358   struct gomp_work_share *ws = thr->ts.work_share;
 359   struct gomp_doacross_work_share *doacross = ws->doacross;
 360   unsigned long ent;
 361   unsigned int i;
 362
 363   if (__builtin_expect (doacross == NULL, 0))
 364     {
 365       __sync_synchronize ();
 366       return;
 367     }
 368
 369   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 370     ent = thr->ts.team_id;
 371   else if (ws->sched == GFS_GUIDED)
 372     ent = counts[0];
 373   else
 374     ent = counts[0] / doacross->chunk_size;
 375   unsigned long *array = (unsigned long *) (doacross->array
 376                                             + ent * doacross->elt_sz);
 377
 378   if (__builtin_expect (doacross->flattened, 1))
 379     {
 380       unsigned long flattened
 381         = (unsigned long) counts[0] << doacross->shift_counts[0];
 382
 383       for (i = 1; i < doacross->ncounts; i++)
 384         flattened |= (unsigned long) counts[i]
 385                      << doacross->shift_counts[i];
 386       flattened++;
 387       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
 388         __atomic_thread_fence (MEMMODEL_RELEASE);
 389       else
 390         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
 391       return;
 392     }
 393
 394   __atomic_thread_fence (MEMMODEL_ACQUIRE);
 395   for (i = doacross->ncounts; i-- > 0; )
 396     {
 397       if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
 398         __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
 399     }
 400 }
 401
 402 /* DOACROSS WAIT operation.  */
 403
 404 void
 405 GOMP_doacross_wait (long first, ...)
 406 {
 407   struct gomp_thread *thr = gomp_thread ();
 408   struct gomp_work_share *ws = thr->ts.work_share;
 409   struct gomp_doacross_work_share *doacross = ws->doacross;
 410   va_list ap;
 411   unsigned long ent;
 412   unsigned int i;
 413
 414   if (__builtin_expect (doacross == NULL, 0))
 415     {
 416       __sync_synchronize ();
 417       return;
 418     }
 419
 420   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 421     {
 422       if (ws->chunk_size == 0)
 423         {
 424           if (first < doacross->boundary)
 425             ent = first / (doacross->q + 1);
 426           else
 427             ent = (first - doacross->boundary) / doacross->q
 428                   + doacross->t;
 429         }
 430       else
 431         ent = first / ws->chunk_size % thr->ts.team->nthreads;
 432     }
 433   else if (ws->sched == GFS_GUIDED)
 434     ent = first;
 435   else
 436     ent = first / doacross->chunk_size;
 437   unsigned long *array = (unsigned long *) (doacross->array
 438                                             + ent * doacross->elt_sz);
 439
 440   if (__builtin_expect (doacross->flattened, 1))
 441     {
 442       unsigned long flattened
 443         = (unsigned long) first << doacross->shift_counts[0];
 444       unsigned long cur;
 445
 446       va_start (ap, first);
 447       for (i = 1; i < doacross->ncounts; i++)
 448         flattened |= (unsigned long) va_arg (ap, long)
 449                      << doacross->shift_counts[i];
 450       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
 451       if (flattened < cur)
 452         {
 453           __atomic_thread_fence (MEMMODEL_RELEASE);
 454           va_end (ap);
 455           return;
 456         }
 457       doacross_spin (array, flattened, cur);
 458       __atomic_thread_fence (MEMMODEL_RELEASE);
 459       va_end (ap);
 460       return;
 461     }
 462
 463   do
 464     {
 465       va_start (ap, first);
 466       for (i = 0; i < doacross->ncounts; i++)
 467         {
 468           unsigned long thisv
 469             = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
 470           unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
 471           if (thisv < cur)
 472             {
 473               i = doacross->ncounts;
 474               break;
 475             }
 476           if (thisv > cur)
 477             break;
 478         }
 479       va_end (ap);
 480       if (i == doacross->ncounts)
 481         break;
 482       cpu_relax ();
 483     }
 484   while (1);
 485   __sync_synchronize ();
 486 }
 487
 488 typedef unsigned long long gomp_ull;
 489
 490 void
 491 gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
 492 {
 493   struct gomp_thread *thr = gomp_thread ();
 494   struct gomp_team *team = thr->ts.team;
 495   struct gomp_work_share *ws = thr->ts.work_share;
 496   unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
 497   unsigned long ent, num_ents, elt_sz, shift_sz;
 498   struct gomp_doacross_work_share *doacross;
 499
 500   if (team == NULL || team->nthreads == 1)
 501     return;
 502
 503   for (i = 0; i < ncounts; i++)
 504     {
 505       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
 506       if (counts[i] == 0)
 507         return;
 508
 509       if (num_bits <= MAX_COLLAPSED_BITS)
 510         {
 511           unsigned int this_bits;
 512           if (counts[i] == 1)
 513             this_bits = 1;
 514           else
 515             this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
 516                         - __builtin_clzll (counts[i] - 1);
 517           if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
 518             {
 519               bits[i] = this_bits;
 520               num_bits += this_bits;
 521             }
 522           else
 523             num_bits = MAX_COLLAPSED_BITS + 1;
 524         }
 525     }
 526
 527   if (ws->sched == GFS_STATIC)
 528     num_ents = team->nthreads;
 529   else if (ws->sched == GFS_GUIDED)
 530     num_ents = counts[0];
 531   else
 532     num_ents = (counts[0] - 1) / chunk_size + 1;
 533   if (num_bits <= MAX_COLLAPSED_BITS)
 534     {
 535       elt_sz = sizeof (unsigned long);
 536       shift_sz = ncounts * sizeof (unsigned int);
 537     }
 538   else
 539     {
 540       if (sizeof (gomp_ull) == sizeof (unsigned long))
 541         elt_sz = sizeof (gomp_ull) * ncounts;
 542       else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
 543         elt_sz = sizeof (unsigned long) * 2 * ncounts;
 544       else
 545         abort ();
 546       shift_sz = 0;
 547     }
 548   elt_sz = (elt_sz + 63) & ~63UL;
 549
 550   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
 551                           + shift_sz);
 552   doacross->chunk_size_ull = chunk_size;
 553   doacross->elt_sz = elt_sz;
 554   doacross->ncounts = ncounts;
 555   doacross->flattened = false;
 556   doacross->boundary = 0;
 557   doacross->array = (unsigned char *)
 558                     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 559                      & ~(uintptr_t) 63);
 560   if (num_bits <= MAX_COLLAPSED_BITS)
 561     {
 562       unsigned int shift_count = 0;
 563       doacross->flattened = true;
 564       for (i = ncounts; i > 0; i--)
 565         {
 566           doacross->shift_counts[i - 1] = shift_count;
 567           shift_count += bits[i - 1];
 568         }
 569       for (ent = 0; ent < num_ents; ent++)
 570         *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
 571     }
 572   else
 573     for (ent = 0; ent < num_ents; ent++)
 574       memset (doacross->array + ent * elt_sz, '\0',
 575               sizeof (unsigned long) * ncounts);
 576   if (ws->sched == GFS_STATIC && chunk_size == 0)
 577     {
 578       gomp_ull q = counts[0] / num_ents;
 579       gomp_ull t = counts[0] % num_ents;
 580       doacross->boundary_ull = t * (q + 1);
 581       doacross->q_ull = q;
 582       doacross->t = t;
 583     }
 584   ws->doacross = doacross;
 585 }
 586
 587 /* DOACROSS POST operation.  */
 588
 589 void
 590 GOMP_doacross_ull_post (gomp_ull *counts)
 591 {
 592   struct gomp_thread *thr = gomp_thread ();
 593   struct gomp_work_share *ws = thr->ts.work_share;
 594   struct gomp_doacross_work_share *doacross = ws->doacross;
 595   unsigned long ent;
 596   unsigned int i;
 597
 598   if (__builtin_expect (doacross == NULL, 0))
 599     {
 600       __sync_synchronize ();
 601       return;
 602     }
 603
 604   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 605     ent = thr->ts.team_id;
 606   else if (ws->sched == GFS_GUIDED)
 607     ent = counts[0];
 608   else
 609     ent = counts[0] / doacross->chunk_size_ull;
 610
 611   if (__builtin_expect (doacross->flattened, 1))
 612     {
 613       unsigned long *array = (unsigned long *) (doacross->array
 614                               + ent * doacross->elt_sz);
 615       gomp_ull flattened
 616         = counts[0] << doacross->shift_counts[0];
 617
 618       for (i = 1; i < doacross->ncounts; i++)
 619         flattened |= counts[i] << doacross->shift_counts[i];
 620       flattened++;
 621       if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
 622         __atomic_thread_fence (MEMMODEL_RELEASE);
 623       else
 624         __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
 625       return;
 626     }
 627
 628   __atomic_thread_fence (MEMMODEL_ACQUIRE);
 629   if (sizeof (gomp_ull) == sizeof (unsigned long))
 630     {
 631       gomp_ull *array = (gomp_ull *) (doacross->array
 632                                       + ent * doacross->elt_sz);
 633
 634       for (i = doacross->ncounts; i-- > 0; )
 635         {
 636           if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
 637             __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
 638         }
 639     }
 640   else
 641     {
 642       unsigned long *array = (unsigned long *) (doacross->array
 643                                                 + ent * doacross->elt_sz);
 644
 645       for (i = doacross->ncounts; i-- > 0; )
 646         {
 647           gomp_ull cull = counts[i] + 1UL;
 648           unsigned long c = (unsigned long) cull;
 649           if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
 650             __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
 651           c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
 652           if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
 653             __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
 654         }
 655     }
 656 }
 657
 658 /* DOACROSS WAIT operation.  */
 659
 660 void
 661 GOMP_doacross_ull_wait (gomp_ull first, ...)
 662 {
 663   struct gomp_thread *thr = gomp_thread ();
 664   struct gomp_work_share *ws = thr->ts.work_share;
 665   struct gomp_doacross_work_share *doacross = ws->doacross;
 666   va_list ap;
 667   unsigned long ent;
 668   unsigned int i;
 669
 670   if (__builtin_expect (doacross == NULL, 0))
 671     {
 672       __sync_synchronize ();
 673       return;
 674     }
 675
 676   if (__builtin_expect (ws->sched == GFS_STATIC, 1))
 677     {
 678       if (ws->chunk_size_ull == 0)
 679         {
 680           if (first < doacross->boundary_ull)
 681             ent = first / (doacross->q_ull + 1);
 682           else
 683             ent = (first - doacross->boundary_ull) / doacross->q_ull
 684                   + doacross->t;
 685         }
 686       else
 687         ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
 688     }
 689   else if (ws->sched == GFS_GUIDED)
 690     ent = first;
 691   else
 692     ent = first / doacross->chunk_size_ull;
 693
 694   if (__builtin_expect (doacross->flattened, 1))
 695     {
 696       unsigned long *array = (unsigned long *) (doacross->array
 697                                                 + ent * doacross->elt_sz);
 698       gomp_ull flattened = first << doacross->shift_counts[0];
 699       unsigned long cur;
 700
 701       va_start (ap, first);
 702       for (i = 1; i < doacross->ncounts; i++)
 703         flattened |= va_arg (ap, gomp_ull)
 704                      << doacross->shift_counts[i];
 705       cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
 706       if (flattened < cur)
 707         {
 708           __atomic_thread_fence (MEMMODEL_RELEASE);
 709           va_end (ap);
 710           return;
 711         }
 712       doacross_spin (array, flattened, cur);
 713       __atomic_thread_fence (MEMMODEL_RELEASE);
 714       va_end (ap);
 715       return;
 716     }
 717
 718   if (sizeof (gomp_ull) == sizeof (unsigned long))
 719     {
 720       gomp_ull *array = (gomp_ull *) (doacross->array
 721                                       + ent * doacross->elt_sz);
 722       do
 723         {
 724           va_start (ap, first);
 725           for (i = 0; i < doacross->ncounts; i++)
 726             {
 727               gomp_ull thisv
 728                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
 729               gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
 730               if (thisv < cur)
 731                 {
 732                   i = doacross->ncounts;
 733                   break;
 734                 }
 735               if (thisv > cur)
 736                 break;
 737             }
 738           va_end (ap);
 739           if (i == doacross->ncounts)
 740             break;
 741           cpu_relax ();
 742         }
 743       while (1);
 744     }
 745   else
 746     {
 747       unsigned long *array = (unsigned long *) (doacross->array
 748                                                 + ent * doacross->elt_sz);
 749       do
 750         {
 751           va_start (ap, first);
 752           for (i = 0; i < doacross->ncounts; i++)
 753             {
 754               gomp_ull thisv
 755                 = (i ? va_arg (ap, gomp_ull) : first) + 1;
 756               unsigned long t
 757                 = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
 758               unsigned long cur
 759                 = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
 760               if (t < cur)
 761                 {
 762                   i = doacross->ncounts;
 763                   break;
 764                 }
 765               if (t > cur)
 766                 break;
 767               t = thisv;
 768               cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
 769               if (t < cur)
 770                 {
 771                   i = doacross->ncounts;
 772                   break;
 773                 }
 774               if (t > cur)
 775                 break;
 776             }
 777           va_end (ap);
 778           if (i == doacross->ncounts)
 779             break;
 780           cpu_relax ();
 781         }
 782       while (1);
 783     }
 784   __sync_synchronize ();
 785 }