Fix PR tree-optimization/40219
[official-gcc.git] / libgomp / iter_ull.c
blob1754e6333c2d5f4b0266bfeba840b119eba3481c
/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */
28 #include "libgomp.h"
29 #include <stdlib.h>
31 typedef unsigned long long gomp_ull;
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */
int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  /* static_trip == -1 records that this thread already received the
     work share's final iteration; there is nothing left to hand out.  */
  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      /* Nonzero (no more work) only when the range is already empty.  */
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, s0, e0, s, e;

      /* Each thread gets exactly one block; a positive static_trip means
	 this thread already took its block on an earlier call.  */
      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  NOTE(review): the
	 expect hint wraps ws->mode itself, and its value is then compared
	 to zero — value-equivalent to testing ws->mode == 0.  A nonzero
	 mode appears to mean a downward-running loop whose negative step
	 is stored two's-complement in incr_ull — confirm against the
	 work-share setup code.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      q += (q * nthreads != n);	/* Round the per-thread quota up.  */
      s0 = q * i;
      e0 = s0 + q;
      if (e0 > n)
	e0 = n;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      /* The thread covering the last iteration marks itself with -1 so
	 the caller can tell it received the absolutely last piece.  */
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  static_trip counts how many
	 rounds this thread has completed.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */
145 bool
146 gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
148 struct gomp_thread *thr = gomp_thread ();
149 struct gomp_work_share *ws = thr->ts.work_share;
150 gomp_ull start, end, chunk, left;
152 start = ws->next_ull;
153 if (start == ws->end_ull)
154 return false;
156 chunk = ws->chunk_size_ull;
157 left = ws->end_ull - start;
158 if (__builtin_expect (ws->mode & 2, 0))
160 if (chunk < left)
161 chunk = left;
163 else
165 if (chunk > left)
166 chunk = left;
168 end = start + chunk;
170 ws->next_ull = end;
171 *pstart = start;
172 *pend = end;
173 return true;
177 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  /* end and chunk_size never change after work-share init; only
     next_ull is modified concurrently.  */
  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  /* Fast path: mode bit 0 set appears to guarantee that blindly
     advancing next_ull by whole chunks cannot wrap past the usable
     range, so a single fetch-and-add suffices — confirm against the
     mode computation in the work-share setup code.  */
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      /* NOTE(review): the expect hint wraps (ws->mode & 2) and its value
	 is compared to zero — value-equivalent to testing the bit clear.
	 Bit 1 clear is the upward-iterating case.  */
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  /* Downward case: comparisons flip because the unsigned values
	     represent a descending sequence.  */
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  /* Slow path: claim [start, nend) with compare-and-swap, clamping the
     chunk to what is left so next_ull never moves past end.  */
  start = ws->next_ull;
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      /* On CAS failure another thread advanced the cursor; retry from
	 the value it published.  */
      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
250 #endif /* HAVE_SYNC_BUILTINS */
/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */
257 bool
258 gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
260 struct gomp_thread *thr = gomp_thread ();
261 struct gomp_work_share *ws = thr->ts.work_share;
262 struct gomp_team *team = thr->ts.team;
263 gomp_ull nthreads = team ? team->nthreads : 1;
264 gomp_ull n, q;
265 gomp_ull start, end;
267 if (ws->next_ull == ws->end_ull)
268 return false;
270 start = ws->next_ull;
271 if (__builtin_expect (ws->mode, 0) == 0)
272 n = (ws->end_ull - start) / ws->incr_ull;
273 else
274 n = (start - ws->end_ull) / -ws->incr_ull;
275 q = (n + nthreads - 1) / nthreads;
277 if (q < ws->chunk_size_ull)
278 q = ws->chunk_size_ull;
279 if (q <= n)
280 end = start + q * ws->incr_ull;
281 else
282 end = ws->end_ull;
284 ws->next_ull = end;
285 *pstart = start;
286 *pend = end;
287 return true;
290 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  /* Snapshot the work-share fields; of these only next_ull changes
     concurrently, and stale reads of it are repaired by the CAS retry
     loop below.  */
  start = ws->next_ull;
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      /* Iterations still to be handed out.  NOTE(review): the expect
	 hint wraps ws->mode itself, and its value is then compared to
	 zero — value-equivalent to testing ws->mode == 0.  Nonzero mode
	 appears to be the downward-loop case — confirm against the
	 work-share setup code.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;

      /* Guided schedule: a 1/nthreads share of what is left, rounded
	 up, but never less than the requested chunk size.  */
      q = (n + nthreads - 1) / nthreads;
      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      /* Try to claim [start, nend); on failure retry from the cursor
	 value another thread published.  */
      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
341 #endif /* HAVE_SYNC_BUILTINS */