libgomp/iter_ull.c

   1 /* Copyright (C) 2005-2024 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson <rth@redhat.com>.
   3
   4    This file is part of the GNU Offloading and Multi Processing Library
   5    (libgomp).
   6
   7    Libgomp is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  13    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  14    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15    more details.
  16
  17    Under Section 7 of GPL version 3, you are granted additional
  18    permissions described in the GCC Runtime Library Exception, version
  19    3.1, as published by the Free Software Foundation.
  20
  21    You should have received a copy of the GNU General Public License and
  22    a copy of the GCC Runtime Library Exception along with this program;
  23    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24    <http://www.gnu.org/licenses/>.  */
  25
  26 /* This file contains routines for managing work-share iteration, both
  27    for loops and sections.  */
  28
  29 #include "libgomp.h"
  30 #include <stdlib.h>
  31
  32 typedef unsigned long long gomp_ull;
  33
  34 /* This function implements the STATIC scheduling method.  The caller should
  35    iterate *pstart <= x < *pend.  Return zero if there are more iterations
  36    to perform; nonzero if not.  Return less than 0 if this thread had
  37    received the absolutely last iteration.  */
  38
  39 int
  40 gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
  41 {
  42   struct gomp_thread *thr = gomp_thread ();
  43   struct gomp_team *team = thr->ts.team;
  44   struct gomp_work_share *ws = thr->ts.work_share;
  45   unsigned long nthreads = team ? team->nthreads : 1;
  46
  47   if (thr->ts.static_trip == -1)
  48     return -1;
  49
  50   /* Quick test for degenerate teams and orphaned constructs.  */
  51   if (nthreads == 1)
  52     {
  53       *pstart = ws->next_ull;
  54       *pend = ws->end_ull;
  55       thr->ts.static_trip = -1;
  56       return ws->next_ull == ws->end_ull;
  57     }
  58
  59   /* We interpret chunk_size zero as "unspecified", which means that we
  60      should break up the iterations such that each thread makes only one
  61      trip through the outer loop.  */
  62   if (ws->chunk_size_ull == 0)
  63     {
  64       gomp_ull n, q, i, t, s0, e0, s, e;
  65
  66       if (thr->ts.static_trip > 0)
  67         return 1;
  68
  69       /* Compute the total number of iterations.  */
  70       if (__builtin_expect (ws->mode, 0) == 0)
  71         n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
  72       else
  73         n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
  74       i = thr->ts.team_id;
  75
  76       /* Compute the "zero-based" start and end points.  That is, as
  77          if the loop began at zero and incremented by one.  */
  78       q = n / nthreads;
  79       t = n % nthreads;
  80       if (i < t)
  81         {
  82           t = 0;
  83           q++;
  84         }
  85       s0 = q * i + t;
  86       e0 = s0 + q;
  87
  88       /* Notice when no iterations allocated for this thread.  */
  89       if (s0 >= e0)
  90         {
  91           thr->ts.static_trip = 1;
  92           return 1;
  93         }
  94
  95       /* Transform these to the actual start and end numbers.  */
  96       s = s0 * ws->incr_ull + ws->next_ull;
  97       e = e0 * ws->incr_ull + ws->next_ull;
  98
  99       *pstart = s;
 100       *pend = e;
 101       thr->ts.static_trip = (e0 == n ? -1 : 1);
 102       return 0;
 103     }
 104   else
 105     {
 106       gomp_ull n, s0, e0, i, c, s, e;
 107
 108       /* Otherwise, each thread gets exactly chunk_size iterations
 109          (if available) each time through the loop.  */
 110
 111       if (__builtin_expect (ws->mode, 0) == 0)
 112         n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
 113       else
 114         n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
 115       i = thr->ts.team_id;
 116       c = ws->chunk_size_ull;
 117
 118       /* Initial guess is a C sized chunk positioned nthreads iterations
 119          in, offset by our thread number.  */
 120       s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
 121       e0 = s0 + c;
 122
 123       /* Detect overflow.  */
 124       if (s0 >= n)
 125         return 1;
 126       if (e0 > n)
 127         e0 = n;
 128
 129       /* Transform these to the actual start and end numbers.  */
 130       s = s0 * ws->incr_ull + ws->next_ull;
 131       e = e0 * ws->incr_ull + ws->next_ull;
 132
 133       *pstart = s;
 134       *pend = e;
 135
 136       if (e0 == n)
 137         thr->ts.static_trip = -1;
 138       else
 139         thr->ts.static_trip++;
 140       return 0;
 141     }
 142 }
 143
 144
 145 /* This function implements the DYNAMIC scheduling method.  Arguments are
 146    as for gomp_iter_ull_static_next.  This function must be called with
 147    ws->lock held.  */
 148
 149 bool
 150 gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
 151 {
 152   struct gomp_thread *thr = gomp_thread ();
 153   struct gomp_work_share *ws = thr->ts.work_share;
 154   gomp_ull start, end, chunk, left;
 155
 156   start = ws->next_ull;
 157   if (start == ws->end_ull)
 158     return false;
 159
 160   chunk = ws->chunk_size_ull;
 161   left = ws->end_ull - start;
 162   if (__builtin_expect (ws->mode & 2, 0))
 163     {
 164       if (chunk < left)
 165         chunk = left;
 166     }
 167   else
 168     {
 169       if (chunk > left)
 170         chunk = left;
 171     }
 172   end = start + chunk;
 173
 174   ws->next_ull = end;
 175   *pstart = start;
 176   *pend = end;
 177   return true;
 178 }
 179
 180
 181 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
 182 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 183    instead.  Note that the only memory value that changes is ws->next_ull.  */
 184
 185 bool
 186 gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
 187 {
 188   struct gomp_thread *thr = gomp_thread ();
 189   struct gomp_work_share *ws = thr->ts.work_share;
 190   gomp_ull start, end, nend, chunk;
 191
 192   end = ws->end_ull;
 193   chunk = ws->chunk_size_ull;
 194
 195   if (__builtin_expect (ws->mode & 1, 1))
 196     {
 197       gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
 198       if (__builtin_expect (ws->mode & 2, 0) == 0)
 199         {
 200           if (tmp >= end)
 201             return false;
 202           nend = tmp + chunk;
 203           if (nend > end)
 204             nend = end;
 205           *pstart = tmp;
 206           *pend = nend;
 207           return true;
 208         }
 209       else
 210         {
 211           if (tmp <= end)
 212             return false;
 213           nend = tmp + chunk;
 214           if (nend < end)
 215             nend = end;
 216           *pstart = tmp;
 217           *pend = nend;
 218           return true;
 219         }
 220     }
 221
 222   start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
 223   while (1)
 224     {
 225       gomp_ull left = end - start;
 226       gomp_ull tmp;
 227
 228       if (start == end)
 229         return false;
 230
 231       if (__builtin_expect (ws->mode & 2, 0))
 232         {
 233           if (chunk < left)
 234             chunk = left;
 235         }
 236       else
 237         {
 238           if (chunk > left)
 239             chunk = left;
 240         }
 241       nend = start + chunk;
 242
 243       tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
 244       if (__builtin_expect (tmp == start, 1))
 245         break;
 246
 247       start = tmp;
 248     }
 249
 250   *pstart = start;
 251   *pend = nend;
 252   return true;
 253 }
 254 #endif /* HAVE_SYNC_BUILTINS */
 255
 256
 257 /* This function implements the GUIDED scheduling method.  Arguments are
 258    as for gomp_iter_ull_static_next.  This function must be called with the
 259    work share lock held.  */
 260
 261 bool
 262 gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
 263 {
 264   struct gomp_thread *thr = gomp_thread ();
 265   struct gomp_work_share *ws = thr->ts.work_share;
 266   struct gomp_team *team = thr->ts.team;
 267   gomp_ull nthreads = team ? team->nthreads : 1;
 268   gomp_ull n, q;
 269   gomp_ull start, end;
 270
 271   if (ws->next_ull == ws->end_ull)
 272     return false;
 273
 274   start = ws->next_ull;
 275   if (__builtin_expect (ws->mode, 0) == 0)
 276     n = (ws->end_ull - start) / ws->incr_ull;
 277   else
 278     n = (start - ws->end_ull) / -ws->incr_ull;
 279   q = (n + nthreads - 1) / nthreads;
 280
 281   if (q < ws->chunk_size_ull)
 282     q = ws->chunk_size_ull;
 283   if (q <= n)
 284     end = start + q * ws->incr_ull;
 285   else
 286     end = ws->end_ull;
 287
 288   ws->next_ull = end;
 289   *pstart = start;
 290   *pend = end;
 291   return true;
 292 }
 293
 294 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
 295 /* Similar, but doesn't require the lock held, and uses compare-and-swap
 296    instead.  Note that the only memory value that changes is ws->next_ull.  */
 297
 298 bool
 299 gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
 300 {
 301   struct gomp_thread *thr = gomp_thread ();
 302   struct gomp_work_share *ws = thr->ts.work_share;
 303   struct gomp_team *team = thr->ts.team;
 304   gomp_ull nthreads = team ? team->nthreads : 1;
 305   gomp_ull start, end, nend, incr;
 306   gomp_ull chunk_size;
 307
 308   start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
 309   end = ws->end_ull;
 310   incr = ws->incr_ull;
 311   chunk_size = ws->chunk_size_ull;
 312
 313   while (1)
 314     {
 315       gomp_ull n, q;
 316       gomp_ull tmp;
 317
 318       if (start == end)
 319         return false;
 320
 321       if (__builtin_expect (ws->mode, 0) == 0)
 322         n = (end - start) / incr;
 323       else
 324         n = (start - end) / -incr;
 325       q = (n + nthreads - 1) / nthreads;
 326
 327       if (q < chunk_size)
 328         q = chunk_size;
 329       if (__builtin_expect (q <= n, 1))
 330         nend = start + q * incr;
 331       else
 332         nend = end;
 333
 334       tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
 335       if (__builtin_expect (tmp == start, 1))
 336         break;
 337
 338       start = tmp;
 339     }
 340
 341   *pstart = start;
 342   *pend = nend;
 343   return true;
 344 }
 345 #endif /* HAVE_SYNC_BUILTINS */