Remove not needed __builtin_expect due to malloc predictor.
[official-gcc.git] / libgomp / iter_ull.c
blobfba37af0f45a11636092738d3e0a9eaeda00f8b2
1 /* Copyright (C) 2005-2018 Free Software Foundation, Inc.
2 Contributed by Richard Henderson <rth@redhat.com>.
4 This file is part of the GNU Offloading and Multi Processing Library
5 (libgomp).
7 Libgomp is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 more details.
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
26 /* This file contains routines for managing work-share iteration, both
27 for loops and sections. */
29 #include "libgomp.h"
30 #include <stdlib.h>
32 typedef unsigned long long gomp_ull;
34 /* This function implements the STATIC scheduling method. The caller should
35 iterate *pstart <= x < *pend. Return zero if there are more iterations
36 to perform; nonzero if not. Return less than 0 if this thread had
37 received the absolutely last iteration. */
39 int
40 gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
42 struct gomp_thread *thr = gomp_thread ();
43 struct gomp_team *team = thr->ts.team;
44 struct gomp_work_share *ws = thr->ts.work_share;
45 unsigned long nthreads = team ? team->nthreads : 1;
47 if (thr->ts.static_trip == -1)
48 return -1;
50 /* Quick test for degenerate teams and orphaned constructs. */
51 if (nthreads == 1)
53 *pstart = ws->next_ull;
54 *pend = ws->end_ull;
55 thr->ts.static_trip = -1;
56 return ws->next_ull == ws->end_ull;
59 /* We interpret chunk_size zero as "unspecified", which means that we
60 should break up the iterations such that each thread makes only one
61 trip through the outer loop. */
62 if (ws->chunk_size_ull == 0)
64 gomp_ull n, q, i, t, s0, e0, s, e;
66 if (thr->ts.static_trip > 0)
67 return 1;
69 /* Compute the total number of iterations. */
70 if (__builtin_expect (ws->mode, 0) == 0)
71 n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
72 else
73 n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
74 i = thr->ts.team_id;
76 /* Compute the "zero-based" start and end points. That is, as
77 if the loop began at zero and incremented by one. */
78 q = n / nthreads;
79 t = n % nthreads;
80 if (i < t)
82 t = 0;
83 q++;
85 s0 = q * i + t;
86 e0 = s0 + q;
88 /* Notice when no iterations allocated for this thread. */
89 if (s0 >= e0)
91 thr->ts.static_trip = 1;
92 return 1;
95 /* Transform these to the actual start and end numbers. */
96 s = s0 * ws->incr_ull + ws->next_ull;
97 e = e0 * ws->incr_ull + ws->next_ull;
99 *pstart = s;
100 *pend = e;
101 thr->ts.static_trip = (e0 == n ? -1 : 1);
102 return 0;
104 else
106 gomp_ull n, s0, e0, i, c, s, e;
108 /* Otherwise, each thread gets exactly chunk_size iterations
109 (if available) each time through the loop. */
111 if (__builtin_expect (ws->mode, 0) == 0)
112 n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
113 else
114 n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
115 i = thr->ts.team_id;
116 c = ws->chunk_size_ull;
118 /* Initial guess is a C sized chunk positioned nthreads iterations
119 in, offset by our thread number. */
120 s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
121 e0 = s0 + c;
123 /* Detect overflow. */
124 if (s0 >= n)
125 return 1;
126 if (e0 > n)
127 e0 = n;
129 /* Transform these to the actual start and end numbers. */
130 s = s0 * ws->incr_ull + ws->next_ull;
131 e = e0 * ws->incr_ull + ws->next_ull;
133 *pstart = s;
134 *pend = e;
136 if (e0 == n)
137 thr->ts.static_trip = -1;
138 else
139 thr->ts.static_trip++;
140 return 0;
145 /* This function implements the DYNAMIC scheduling method. Arguments are
146 as for gomp_iter_ull_static_next. This function must be called with
147 ws->lock held. */
149 bool
150 gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
152 struct gomp_thread *thr = gomp_thread ();
153 struct gomp_work_share *ws = thr->ts.work_share;
154 gomp_ull start, end, chunk, left;
156 start = ws->next_ull;
157 if (start == ws->end_ull)
158 return false;
160 chunk = ws->chunk_size_ull;
161 left = ws->end_ull - start;
162 if (__builtin_expect (ws->mode & 2, 0))
164 if (chunk < left)
165 chunk = left;
167 else
169 if (chunk > left)
170 chunk = left;
172 end = start + chunk;
174 ws->next_ull = end;
175 *pstart = start;
176 *pend = end;
177 return true;
181 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
182 /* Similar, but doesn't require the lock held, and uses compare-and-swap
183 instead. Note that the only memory value that changes is ws->next_ull. */
185 bool
186 gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
188 struct gomp_thread *thr = gomp_thread ();
189 struct gomp_work_share *ws = thr->ts.work_share;
190 gomp_ull start, end, nend, chunk;
192 end = ws->end_ull;
193 chunk = ws->chunk_size_ull;
195 if (__builtin_expect (ws->mode & 1, 1))
197 gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
198 if (__builtin_expect (ws->mode & 2, 0) == 0)
200 if (tmp >= end)
201 return false;
202 nend = tmp + chunk;
203 if (nend > end)
204 nend = end;
205 *pstart = tmp;
206 *pend = nend;
207 return true;
209 else
211 if (tmp <= end)
212 return false;
213 nend = tmp + chunk;
214 if (nend < end)
215 nend = end;
216 *pstart = tmp;
217 *pend = nend;
218 return true;
222 start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
223 while (1)
225 gomp_ull left = end - start;
226 gomp_ull tmp;
228 if (start == end)
229 return false;
231 if (__builtin_expect (ws->mode & 2, 0))
233 if (chunk < left)
234 chunk = left;
236 else
238 if (chunk > left)
239 chunk = left;
241 nend = start + chunk;
243 tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
244 if (__builtin_expect (tmp == start, 1))
245 break;
247 start = tmp;
250 *pstart = start;
251 *pend = nend;
252 return true;
254 #endif /* HAVE_SYNC_BUILTINS */
257 /* This function implements the GUIDED scheduling method. Arguments are
258 as for gomp_iter_ull_static_next. This function must be called with the
259 work share lock held. */
261 bool
262 gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
264 struct gomp_thread *thr = gomp_thread ();
265 struct gomp_work_share *ws = thr->ts.work_share;
266 struct gomp_team *team = thr->ts.team;
267 gomp_ull nthreads = team ? team->nthreads : 1;
268 gomp_ull n, q;
269 gomp_ull start, end;
271 if (ws->next_ull == ws->end_ull)
272 return false;
274 start = ws->next_ull;
275 if (__builtin_expect (ws->mode, 0) == 0)
276 n = (ws->end_ull - start) / ws->incr_ull;
277 else
278 n = (start - ws->end_ull) / -ws->incr_ull;
279 q = (n + nthreads - 1) / nthreads;
281 if (q < ws->chunk_size_ull)
282 q = ws->chunk_size_ull;
283 if (q <= n)
284 end = start + q * ws->incr_ull;
285 else
286 end = ws->end_ull;
288 ws->next_ull = end;
289 *pstart = start;
290 *pend = end;
291 return true;
294 #if defined HAVE_SYNC_BUILTINS && defined __LP64__
295 /* Similar, but doesn't require the lock held, and uses compare-and-swap
296 instead. Note that the only memory value that changes is ws->next_ull. */
298 bool
299 gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
301 struct gomp_thread *thr = gomp_thread ();
302 struct gomp_work_share *ws = thr->ts.work_share;
303 struct gomp_team *team = thr->ts.team;
304 gomp_ull nthreads = team ? team->nthreads : 1;
305 gomp_ull start, end, nend, incr;
306 gomp_ull chunk_size;
308 start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
309 end = ws->end_ull;
310 incr = ws->incr_ull;
311 chunk_size = ws->chunk_size_ull;
313 while (1)
315 gomp_ull n, q;
316 gomp_ull tmp;
318 if (start == end)
319 return false;
321 if (__builtin_expect (ws->mode, 0) == 0)
322 n = (end - start) / incr;
323 else
324 n = (start - end) / -incr;
325 q = (n + nthreads - 1) / nthreads;
327 if (q < chunk_size)
328 q = chunk_size;
329 if (__builtin_expect (q <= n, 1))
330 nend = start + q * incr;
331 else
332 nend = end;
334 tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
335 if (__builtin_expect (tmp == start, 1))
336 break;
338 start = tmp;
341 *pstart = start;
342 *pend = nend;
343 return true;
345 #endif /* HAVE_SYNC_BUILTINS */