Update ChangeLog and version files for release
[official-gcc.git] / libcilkrts / runtime / cilk-abi-cilk-for.cpp
blob4cd04f521cf2428d36c88bac1d7424dc75cf9ebd
1 /* cilk-abi-cilk-for.cpp -*-C++-*-
3 *************************************************************************
5 * @copyright
6 * Copyright (C) 2011, 2013, Intel Corporation
7 * All rights reserved.
8 *
9 * @copyright
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
19 * distribution.
20 * * Neither the name of Intel Corporation nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
24 * @copyright
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
38 **************************************************************************/
40 /* Implementation of cilk_for ABI.
42 * This file must be C++, not C, in order to handle C++ exceptions correctly
43 * from within the body of the cilk_for loop
46 #include "internal/abi.h"
47 #include "metacall_impl.h"
48 #include "global_state.h"
50 // Icky macros to determine if we're compiled with optimization. Based on
51 // the declaration of __CILKRTS_ASSERT in common.h
52 #if defined(_WIN32)
53 # if defined (_DEBUG)
54 # define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
55 # else
56 # define CILKRTS_OPTIMIZED 1
57 # endif // defined(_DEBUG)
58 #else
59 # if defined(__OPTIMIZE__)
60 # define CILKRTS_OPTIMIZED 1
61 # else
62 # define CILKRTS_OPTIMIZED 0
63 # endif
64 #endif
66 template <typename count_t>
67 static inline int grainsize(int req, count_t count)
69 // A positive requested grain size comes from the user. A very high grain
70 // size risks losing parallelism, but the user told us what they want for
71 // grainsize. Who are we to argue?
72 if (req > 0)
73 return req;
75 // At present, a negative requested grain size is treated the same way as
76 // a zero grain size, i.e., the runtime computes the actual grainsize
77 // using a hueristic. In the future, the compiler may give us additional
78 // information about the size of the cilk_for body by passing a negative
79 // grain size.
81 // Avoid generating a zero grainsize, even for empty loops.
82 if (count < 1)
83 return 1;
85 global_state_t* g = cilkg_get_global_state();
86 if (g->under_ptool)
88 // Grainsize = 1, when running under PIN, and when the grainsize has
89 // not explicitly been set by the user.
90 return 1;
92 else
94 // Divide loop count by 8 times the worker count and round up.
95 const int Px8 = g->P * 8;
96 count_t n = (count + Px8 - 1) / Px8;
98 // 2K should be enough to amortize the cost of the cilk_for. Any
99 // larger grainsize risks losing parallelism.
100 if (n > 2048)
101 return 2048;
102 return (int) n; // n <= 2048, so no loss of precision on cast to int
107 * call_cilk_for_loop_body
109 * Centralizes the code to call the loop body. The compiler should be
110 * inlining this code
112 * low - Low loop index we're considering in this portion of the algorithm
113 * high - High loop index we're considering in this portion of the algorithm
114 * body - lambda function for the cilk_for loop body
115 * data - data used by the lambda function
116 * w - __cilkrts_worker we're currently executing on
117 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
118 * the cilk_for loop to flatten out the internal nodes
120 template <typename count_t, typename F>
121 inline static
122 void call_cilk_for_loop_body(count_t low, count_t high,
123 F body, void *data,
124 __cilkrts_worker *w,
125 __cilkrts_pedigree *loop_root_pedigree)
127 // Cilkscreen should not report this call in a stack trace
128 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
130 // The worker is only valid until the first spawn. Fetch the
131 // __cilkrts_stack_frame out of the worker, since it will be stable across
132 // steals. The sf pointer actually points to the *parent's*
133 // __cilkrts_stack_frame, since this function is a non-spawning function
134 // and therefore has no cilk stack frame of its own.
135 __cilkrts_stack_frame *sf = w->current_stack_frame;
137 // Save the pedigree node pointed to by the worker. We'll need to restore
138 // that when we exit since the spawn helpers in the cilk_for call tree
139 // will assume that it's valid
140 const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
142 // Add the leaf pedigree node to the chain. The parent is the root node
143 // to flatten the tree regardless of the DAG branches in the cilk_for
144 // divide-and-conquer recursion.
146 // The rank is initialized to the low index. The user is
147 // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
148 // loop body.
149 __cilkrts_pedigree loop_leaf_pedigree;
151 loop_leaf_pedigree.rank = (uint64_t)low;
152 loop_leaf_pedigree.parent = loop_root_pedigree;
154 // The worker's pedigree always starts with a rank of 0
155 w->pedigree.rank = 0;
156 w->pedigree.parent = &loop_leaf_pedigree;
158 // Call the compiler generated cilk_for loop body lambda function
159 body(data, low, high);
161 // The loop body may have included spawns, so we must refetch the worker
162 // from the __cilkrts_stack_frame, which is stable regardless of which
163 // worker we're executing on.
164 w = sf->worker;
166 // Restore the pedigree chain. It must be valid because the spawn helpers
167 // generated by the cilk_for implementation will access it.
168 w->pedigree.parent = saved_next_pedigree_node;
171 /* capture_spawn_arg_stack_frame
173 * Efficiently get the address of the caller's __cilkrts_stack_frame. The
174 * preconditons are that 'w' is the worker at the time of the call and
175 * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
176 * spawn helper. This function should be called only within the argument list
177 * of a function that is being spawned because that is the only situation in
178 * which these preconditions hold. This function returns the worker
179 * (unchanged) after storing the captured stack frame pointer is stored in the
180 * sf argument.
182 * The purpose of this function is to get the caller's stack frame in a
183 * context where the caller's worker is known but its stack frame is not
184 * necessarily initialized. The "shrink wrap" optimization delays
185 * initializing the contents of a spawning function's '__cilkrts_stack_frame'
186 * as well as the 'current_stack_frame' pointer within the worker. By calling
187 * this function within a spawning function's argument list, we can ensure
188 * that these initializations have occured but that a detach (which would
189 * invalidate the worker pointer in the caller) has not yet occured. Once the
190 * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
191 * remainder of the caller's execution, and becomes an efficient way to get
192 * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
193 * even after a spawn or sync.
195 inline __cilkrts_worker*
196 capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
198 // Get current stack frame
199 sf = w->current_stack_frame;
200 #ifdef __INTEL_COMPILER
201 # if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
202 // In older compilers 'w->current_stack_frame' points to the
203 // spawn-helper's stack frame. In newer compiler's however, it points
204 // directly to the pointer's stack frame. (This change was made to avoid
205 // having the spawn helper in the frame list when evaluating function
206 // arguments, thus avoiding corruption when those arguments themselves
207 // contain cilk_spawns.)
209 // w->current_stack_frame is the spawn helper's stack frame.
210 // w->current_stack_frame->call_parent is the caller's stack frame.
211 sf = sf->call_parent;
212 # endif
213 #endif
214 return w;
218 * cilk_for_recursive
220 * Templatized function to implement the recursive divide-and-conquer
221 * algorithm that's how we implement a cilk_for.
223 * low - Low loop index we're considering in this portion of the algorithm
224 * high - High loop index we're considering in this portion of the algorithm
225 * body - lambda function for the cilk_for loop body
226 * data - data used by the lambda function
227 * grain - grain size (0 if it should be computed)
228 * w - __cilkrts_worker we're currently executing on
229 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
230 * the cilk_for loop to flatten out the internal nodes
232 template <typename count_t, typename F>
233 static
234 void cilk_for_recursive(count_t low, count_t high,
235 F body, void *data, int grain,
236 __cilkrts_worker *w,
237 __cilkrts_pedigree *loop_root_pedigree)
239 tail_recurse:
240 // Cilkscreen should not report this call in a stack trace
241 // This needs to be done everytime the worker resumes
242 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
244 count_t count = high - low;
245 // Invariant: count > 0, grain >= 1
246 if (count > grain)
248 // Invariant: count >= 2
249 count_t mid = low + count / 2;
250 // The worker is valid only until the first spawn and is expensive to
251 // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
252 // '__cilkrts_stack_frame' is more stable, but isn't initialized until
253 // the first spawn. Thus, we want to grab the address of the
254 // '__cilkrts_stack_frame' after it is initialized but before the
255 // spawn detaches. The only place we can do that is within the
256 // argument list of the spawned function, hence the call to
257 // capture_spawn_arg_stack_frame().
258 __cilkrts_stack_frame *sf;
259 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
260 // The current version of gcc initializes the sf structure eagerly.
261 // We can take advantage of this fact to avoid calling
262 // `capture_spawn_arg_stack_frame` when compiling with gcc.
263 // Remove this if the "shrink-wrap" optimization is implemented.
264 sf = w->current_stack_frame;
265 _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
266 loop_root_pedigree);
267 #else
268 _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
269 capture_spawn_arg_stack_frame(sf, w),
270 loop_root_pedigree);
271 #endif
272 w = sf->worker;
273 low = mid;
275 goto tail_recurse;
278 // Call the cilk_for loop body lambda function passed in by the compiler to
279 // execute one grain
280 call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
283 static void noop() { }
286 * cilk_for_root
288 * Templatized function to implement the top level of a cilk_for loop.
290 * body - lambda function for the cilk_for loop body
291 * data - data used by the lambda function
292 * count - trip count for loop
293 * grain - grain size (0 if it should be computed)
295 template <typename count_t, typename F>
296 static void cilk_for_root(F body, void *data, count_t count, int grain)
298 // Cilkscreen should not report this call in a stack trace
299 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
301 // Pedigree computation:
303 // If the last pedigree node on entry to the _Cilk_for has value X,
304 // then at the start of each iteration of the loop body, the value of
305 // the last pedigree node should be 0, the value of the second-to-last
306 // node should equal the loop counter, and the value of the
307 // third-to-last node should be X. On return from the _Cilk_for, the
308 // value of the last pedigree should be incremented to X+2. The
309 // pedigree within the loop is thus flattened, such that the depth of
310 // recursion does not affect the results either inside or outside of
311 // the loop. Note that the pedigree after the loop exists is the same
312 // as if a single spawn and sync were executed within this function.
314 // TBD: Since the shrink-wrap optimization was turned on in the compiler,
315 // it is not possible to get the current stack frame without actually
316 // forcing a call to bind-thread. This spurious spawn is a temporary
317 // stopgap until the correct intrinsics are added to give us total control
318 // over frame initialization.
319 _Cilk_spawn noop();
321 // Fetch the current worker. From that we can get the current stack frame
322 // which will be constant even if we're stolen
323 __cilkrts_worker *w = __cilkrts_get_tls_worker();
324 __cilkrts_stack_frame *sf = w->current_stack_frame;
326 // Decrement the rank by one to undo the pedigree change from the
327 // _Cilk_spawn
328 --w->pedigree.rank;
330 // Save the current worker pedigree into loop_root_pedigree, which will be
331 // the root node for our flattened pedigree.
332 __cilkrts_pedigree loop_root_pedigree = w->pedigree;
334 // Don't splice the loop_root node in yet. It will be done when we
335 // call the loop body lambda function
336 // w->pedigree.rank = 0;
337 // w->pedigree.next = &loop_root_pedigree;
339 /* Spawn is necessary at top-level to force runtime to start up.
340 * Runtime must be started in order to call the grainsize() function.
342 int gs = grainsize(grain, count);
343 cilk_for_recursive((count_t) 0, count, body, data, gs, w,
344 &loop_root_pedigree);
346 // Need to refetch the worker after calling a spawning function.
347 w = sf->worker;
349 // Restore the pedigree in the worker.
350 w->pedigree = loop_root_pedigree;
352 // Bump the worker pedigree.
353 ++w->pedigree.rank;
355 // Implicit sync will increment the pedigree leaf rank again, for a total
356 // of two increments. If the noop spawn above is removed, then we'll need
357 // to re-enable the following code:
358 // // If this is an optimized build, then the compiler will have optimized
359 // // out the increment of the worker's pedigree in the implied sync. We
360 // // need to add one to make the pedigree_loop test work correctly.
361 // #if CILKRTS_OPTIMIZED
362 // ++sf->worker->pedigree.rank;
363 // #endif
366 // Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
367 // __cilkrts_cilk_for_64.
368 extern "C" {
371 * __cilkrts_cilk_for_32
373 * Implementation of cilk_for for 32-bit trip counts (regardless of processor
374 * word size). Assumes that the range is 0 - count.
376 * body - lambda function for the cilk_for loop body
377 * data - data used by the lambda function
378 * count - trip count for loop
379 * grain - grain size (0 if it should be computed)
382 CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
383 cilk32_t count, int grain)
385 // Cilkscreen should not report this call in a stack trace
386 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
388 // Check for an empty range here as an optimization - don't need to do any
389 // __cilkrts_stack_frame initialization
390 if (count > 0)
391 cilk_for_root(body, data, count, grain);
395 * __cilkrts_cilk_for_64
397 * Implementation of cilk_for for 64-bit trip counts (regardless of processor
398 * word size). Assumes that the range is 0 - count.
400 * body - lambda function for the cilk_for loop body
401 * data - data used by the lambda function
402 * count - trip count for loop
403 * grain - grain size (0 if it should be computed)
405 CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
406 cilk64_t count, int grain)
408 // Check for an empty range here as an optimization - don't need to do any
409 // __cilkrts_stack_frame initialization
410 if (count > 0)
411 cilk_for_root(body, data, count, grain);
414 } // end extern "C"
416 /* End cilk-abi-cilk-for.cpp */