libgomp/iter_ull.c

   1 /* Copyright (C) 2005-2013 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU OpenMP Library (libgomp).
   5
   6    Libgomp is free software; you can redistribute it and/or modify it
   7    under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  12    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14    more details.
  15
  16    Under Section 7 of GPL version 3, you are granted additional
  17    permissions described in the GCC Runtime Library Exception, version
  18    3.1, as published by the Free Software Foundation.
  19
  20    You should have received a copy of the GNU General Public License and
  21    a copy of the GCC Runtime Library Exception along with this program;
  22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23    <http://www.gnu.org/licenses/>.  */
  24
  25 /* This file contains routines for managing work-share iteration, both
  26    for loops and sections.  */
  27
  28 #include "libgomp.h"
  29 #include <stdlib.h>
  30
  31 typedef unsigned long long gomp_ull;
  32
  33 /* This function implements the STATIC scheduling method.  The caller should
  34    iterate *pstart <= x < *pend.  Return zero if there are more iterations
  35    to perform; nonzero if not.  Return less than 0 if this thread had
  36    received the absolutely last iteration.  */
  37
  38 int
  39 gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
  40 {
  41   struct gomp_thread *thr = gomp_thread ();
  42   struct gomp_team *team = thr->ts.team;
  43   struct gomp_work_share *ws = thr->ts.work_share;
  44   unsigned long nthreads = team ? team->nthreads : 1;
  45
  46   if (thr->ts.static_trip == -1)
  47     return -1;
  48
  49   /* Quick test for degenerate teams and orphaned constructs.  */
  50   if (nthreads == 1)
  51     {
  52       *pstart = ws->next_ull;
  53       *pend = ws->end_ull;
  54       thr->ts.static_trip = -1;
  55       return ws->next_ull == ws->end_ull;
  56     }
  57
  58   /* We interpret chunk_size zero as "unspecified", which means that we
  59      should break up the iterations such that each thread makes only one
  60      trip through the outer loop.  */
  61   if (ws->chunk_size_ull == 0)
  62     {
  63       gomp_ull n, q, i, t, s0, e0, s, e;
  64
  65       if (thr->ts.static_trip > 0)
  66         return 1;
  67
  68       /* Compute the total number of iterations.  */
  69       if (__builtin_expect (ws->mode, 0) == 0)
  70         n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
  71       else
  72         n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
  73       i = thr->ts.team_id;
  74
  75       /* Compute the "zero-based" start and end points.  That is, as
  76          if the loop began at zero and incremented by one.  */
  77       q = n / nthreads;
  78       t = n % nthreads;
  79       if (i < t)
  80         {
  81           t = 0;
  82           q++;
  83         }
  84       s0 = q * i + t;
  85       e0 = s0 + q;
  86
  87       /* Notice when no iterations allocated for this thread.  */
  88       if (s0 >= e0)
  89         {
  90           thr->ts.static_trip = 1;
  91           return 1;
  92         }
  93
  94       /* Transform these to the actual start and end numbers.  */
  95       s = s0 * ws->incr_ull + ws->next_ull;
  96       e = e0 * ws->incr_ull + ws->next_ull;
  97
  98       *pstart = s;
  99       *pend = e;
 100       thr->ts.static_trip = (e0 == n ? -1 : 1);
 101       return 0;
 102     }
 103   else
 104     {
 105       gomp_ull n, s0, e0, i, c, s, e;
 106
 107       /* Otherwise, each thread gets exactly chunk_size iterations
 108          (if available) each time through the loop.  */
 109
 110       if (__builtin_expect (ws->mode, 0) == 0)
 111         n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
 112       else
 113         n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
 114       i = thr->ts.team_id;
 115       c = ws->chunk_size_ull;
 116
 117       /* Initial guess is a C sized chunk positioned nthreads iterations
 118          in, offset by our thread number.  */
 119       s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
 120       e0 = s0 + c;
 121
 122       /* Detect overflow.  */
 123       if (s0 >= n)
 124         return 1;
 125       if (e0 > n)
 126         e0 = n;
 127
 128       /* Transform these to the actual start and end numbers.  */
 129       s = s0 * ws->incr_ull + ws->next_ull;
 130       e = e0 * ws->incr_ull + ws->next_ull;
 131
 132       *pstart = s;
 133       *pend = e;
 134
 135       if (e0 == n)
 136         thr->ts.static_trip = -1;
 137       else
 138         thr->ts.static_trip++;
 139       return 0;
 140     }
 141 }
 142
 143
 144 /* This function implements the DYNAMIC scheduling method.  Arguments are
 145    as for gomp_iter_ull_static_next.  This function must be called with
 146    ws->lock held.  */
 147
 148 bool
 149 gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
 150 {
 151   struct gomp_thread *thr = gomp_thread ();
 152   struct gomp_work_share *ws = thr->ts.work_share;
 153   gomp_ull start, end, chunk, left;
 154
 155   start = ws->next_ull;
 156   if (start == ws->end_ull)
 157     return false;
 158
 159   chunk = ws->chunk_size_ull;
 160   left = ws->end_ull - start;
 161   if (__builtin_expect (ws->mode & 2, 0))
 162     {
 163       if (chunk < left)
 164         chunk = left;
 165     }
 166   else
 167     {
 168       if (chunk > left)
 169         chunk = left;
 170     }
 171   end = start + chunk;
 172
 173   ws->next_ull = end;
 174   *pstart = start;
 175   *pend = end;
 176   return true;
 177 }
 178
 179
 180 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
 181 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 182    instead.  Note that the only memory value that changes is ws->next_ull.  */
 183
 184 bool
 185 gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
 186 {
 187   struct gomp_thread *thr = gomp_thread ();
 188   struct gomp_work_share *ws = thr->ts.work_share;
 189   gomp_ull start, end, nend, chunk;
 190
 191   end = ws->end_ull;
 192   chunk = ws->chunk_size_ull;
 193
 194   if (__builtin_expect (ws->mode & 1, 1))
 195     {
 196       gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
 197       if (__builtin_expect (ws->mode & 2, 0) == 0)
 198         {
 199           if (tmp >= end)
 200             return false;
 201           nend = tmp + chunk;
 202           if (nend > end)
 203             nend = end;
 204           *pstart = tmp;
 205           *pend = nend;
 206           return true;
 207         }
 208       else
 209         {
 210           if (tmp <= end)
 211             return false;
 212           nend = tmp + chunk;
 213           if (nend < end)
 214             nend = end;
 215           *pstart = tmp;
 216           *pend = nend;
 217           return true;
 218         }
 219     }
 220
 221   start = ws->next_ull;
 222   while (1)
 223     {
 224       gomp_ull left = end - start;
 225       gomp_ull tmp;
 226
 227       if (start == end)
 228         return false;
 229
 230       if (__builtin_expect (ws->mode & 2, 0))
 231         {
 232           if (chunk < left)
 233             chunk = left;
 234         }
 235       else
 236         {
 237           if (chunk > left)
 238             chunk = left;
 239         }
 240       nend = start + chunk;
 241
 242       tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
 243       if (__builtin_expect (tmp == start, 1))
 244         break;
 245
 246       start = tmp;
 247     }
 248
 249   *pstart = start;
 250   *pend = nend;
 251   return true;
 252 }
 253 #endif /* HAVE_SYNC_BUILTINS */
 254
 255
 256 /* This function implements the GUIDED scheduling method.  Arguments are
 257    as for gomp_iter_ull_static_next.  This function must be called with the
 258    work share lock held.  */
 259
 260 bool
 261 gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
 262 {
 263   struct gomp_thread *thr = gomp_thread ();
 264   struct gomp_work_share *ws = thr->ts.work_share;
 265   struct gomp_team *team = thr->ts.team;
 266   gomp_ull nthreads = team ? team->nthreads : 1;
 267   gomp_ull n, q;
 268   gomp_ull start, end;
 269
 270   if (ws->next_ull == ws->end_ull)
 271     return false;
 272
 273   start = ws->next_ull;
 274   if (__builtin_expect (ws->mode, 0) == 0)
 275     n = (ws->end_ull - start) / ws->incr_ull;
 276   else
 277     n = (start - ws->end_ull) / -ws->incr_ull;
 278   q = (n + nthreads - 1) / nthreads;
 279
 280   if (q < ws->chunk_size_ull)
 281     q = ws->chunk_size_ull;
 282   if (q <= n)
 283     end = start + q * ws->incr_ull;
 284   else
 285     end = ws->end_ull;
 286
 287   ws->next_ull = end;
 288   *pstart = start;
 289   *pend = end;
 290   return true;
 291 }
 292
 293 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
 294 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 295    instead.  Note that the only memory value that changes is ws->next_ull.  */
 296
 297 bool
 298 gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
 299 {
 300   struct gomp_thread *thr = gomp_thread ();
 301   struct gomp_work_share *ws = thr->ts.work_share;
 302   struct gomp_team *team = thr->ts.team;
 303   gomp_ull nthreads = team ? team->nthreads : 1;
 304   gomp_ull start, end, nend, incr;
 305   gomp_ull chunk_size;
 306
 307   start = ws->next_ull;
 308   end = ws->end_ull;
 309   incr = ws->incr_ull;
 310   chunk_size = ws->chunk_size_ull;
 311
 312   while (1)
 313     {
 314       gomp_ull n, q;
 315       gomp_ull tmp;
 316
 317       if (start == end)
 318         return false;
 319
 320       if (__builtin_expect (ws->mode, 0) == 0)
 321         n = (end - start) / incr;
 322       else
 323         n = (start - end) / -incr;
 324       q = (n + nthreads - 1) / nthreads;
 325
 326       if (q < chunk_size)
 327         q = chunk_size;
 328       if (__builtin_expect (q <= n, 1))
 329         nend = start + q * incr;
 330       else
 331         nend = end;
 332
 333       tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
 334       if (__builtin_expect (tmp == start, 1))
 335         break;
 336
 337       start = tmp;
 338     }
 339
 340   *pstart = start;
 341   *pend = nend;
 342   return true;
 343 }
 344 #endif /* HAVE_SYNC_BUILTINS */