libcilkrts/runtime/scheduler.c
1 /* scheduler.c -*-C-*-
3 *************************************************************************
5 * Copyright (C) 2007-2016, Intel Corporation
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
32 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
35 * *********************************************************************
37  * PLEASE NOTE: This file is a downstream copy of a file maintained in
38 * a repository at cilkplus.org. Changes made to this file that are not
39 * submitted through the contribution process detailed at
40 * http://www.cilkplus.org/submit-cilk-contribution will be lost the next
41 * time that a new version is released. Changes only submitted to the
42 * GNU compiler collection or posted to the git repository at
43 * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
44 * not tracked.
46 * We welcome your contributions to this open source project. Thank you
47 * for your assistance in helping us improve Cilk Plus.
49 **************************************************************************/
52 * Cilk scheduler
55 #include "scheduler.h"
56 #include "bug.h"
57 #include "os.h"
58 #include "os_mutex.h"
59 #include "local_state.h"
60 #include "signal_node.h"
61 #include "full_frame.h"
62 #include "sysdep.h"
63 #include "except.h"
64 #include "cilk_malloc.h"
65 #include "pedigrees.h"
66 #include "record-replay.h"
68 #include <limits.h>
69 #include <string.h> /* memcpy */
70 #include <stdio.h> // sprintf
71 #include <stdlib.h> // malloc, free, abort
73 #ifdef _WIN32
74 # pragma warning(disable:1786) // disable warning: sprintf is deprecated
75 # include "sysdep-win.h"
76 # include "except-win32.h"
77 #endif // _WIN32
79 // ICL: Don't complain about conversion from pointer to same-sized integral
80 // type in __cilkrts_put_stack. That's why we're using ptrdiff_t
81 #ifdef _WIN32
82 # pragma warning(disable: 1684)
83 #endif
85 #include "cilk/cilk_api.h"
86 #include "frame_malloc.h"
87 #include "metacall_impl.h"
88 #include "reducer_impl.h"
89 #include "cilk-tbb-interop.h"
90 #include "cilk-ittnotify.h"
91 #include "stats.h"
93 // ICL: Don't complain about loss of precision in myrand
94 // I tried restoring the warning after the function, but it didn't
95 // suppress it
96 #ifdef _WIN32
97 # pragma warning(disable: 2259)
98 #endif
100 #ifndef _WIN32
101 # include <unistd.h>
102 #endif
104 #ifdef __VXWORKS__
105 // redeclare longjmp() with noreturn to stop warnings
106 extern __attribute__((noreturn))
107 void longjmp(jmp_buf, int);
108 #endif
110 //#define DEBUG_LOCKS 1
111 #ifdef DEBUG_LOCKS
112 // The currently executing worker must own this worker's lock
113 # define ASSERT_WORKER_LOCK_OWNED(w) \
115 __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
116 CILK_ASSERT((w)->l->lock.owner == tls_worker); \
118 #else
119 # define ASSERT_WORKER_LOCK_OWNED(w)
120 #endif // DEBUG_LOCKS
122 // Options for the scheduler.
123 enum schedule_t { SCHEDULE_RUN,
124 SCHEDULE_WAIT,
125 SCHEDULE_EXIT };
127 // Return values for provably_good_steal()
128 enum provably_good_steal_t
130 ABANDON_EXECUTION, // Not the last child to the sync - attempt to steal work
131 CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
132 WAIT_FOR_CONTINUE // The replay log indicates that this was the worker
133 // which continued. Loop until we are the last worker
134 // to the sync.
138 // Verify that "w" is the worker we are currently executing on.
139 // Because this check is expensive, this method is usually a no-op.
140 static inline void verify_current_wkr(__cilkrts_worker *w)
142 #if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
143 // Lookup the worker from TLS and compare to w.
144 __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
145 if (w != tmp) {
146 fprintf(stderr, "Error. W=%d, actual worker =%d...\n",
147 w->self,
148 tmp->self);
150 CILK_ASSERT(w == tmp);
151 #endif
154 static enum schedule_t worker_runnable(__cilkrts_worker *w);
156 // Scheduling-fiber functions:
157 static void do_return_from_spawn (__cilkrts_worker *w,
158 full_frame *ff,
159 __cilkrts_stack_frame *sf);
160 static void do_sync (__cilkrts_worker *w,
161 full_frame *ff,
162 __cilkrts_stack_frame *sf);
164 // max is defined on Windows and VxWorks
165 #if (! defined(_WIN32)) && (! defined(__VXWORKS__))
166 // TBD: definition of max() for Linux.
167 # define max(a, b) ((a) < (b) ? (b) : (a))
168 #endif
170 void __cilkrts_dump_stats_to_stderr(global_state_t *g)
172 #ifdef CILK_PROFILE
173 int i;
174 for (i = 0; i < g->total_workers; ++i) {
175 // Print out statistics for each worker. We collected them,
176 // so why not print them out?
177 fprintf(stderr, "Stats for worker %d\n", i);
178 dump_stats_to_file(stderr, g->workers[i]->l->stats);
179 __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
182 // Also print out aggregate statistics.
183 dump_stats_to_file(stderr, &g->stats);
184 #endif
185 fprintf(stderr,
186 "CILK PLUS Thread Info: P=%d, Q=%d\n",
187 g->P,
188 g->Q);
189 fprintf(stderr,
190 "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
191 (long long)g->frame_malloc.allocated_from_os);
192 #ifdef CILK_PROFILE
193 if (g->stats.stack_hwm)
194 fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
195 #endif
196 fputc('\n', stderr);
199 static void validate_worker(__cilkrts_worker *w)
201 /* check the magic numbers, for debugging purposes */
202 if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
203 w->l->worker_magic_1 != WORKER_MAGIC_1)
204 abort_because_rts_is_corrupted();
207 static void double_link(full_frame *left_ff, full_frame *right_ff)
209 if (left_ff)
210 left_ff->right_sibling = right_ff;
211 if (right_ff)
212 right_ff->left_sibling = left_ff;
215 /* add CHILD to the right of all children of PARENT */
216 static void push_child(full_frame *parent_ff, full_frame *child_ff)
218 double_link(parent_ff->rightmost_child, child_ff);
219 double_link(child_ff, 0);
220 parent_ff->rightmost_child = child_ff;
223 /* unlink CHILD from the list of all children of PARENT */
224 static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
226 double_link(child_ff->left_sibling, child_ff->right_sibling);
228 if (!child_ff->right_sibling) {
229 /* this is the rightmost child -- update parent link */
230 CILK_ASSERT(parent_ff->rightmost_child == child_ff);
231 parent_ff->rightmost_child = child_ff->left_sibling;
233 child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
236 static void incjoin(full_frame *ff)
238 ++ff->join_counter;
241 static int decjoin(full_frame *ff)
243 CILK_ASSERT(ff->join_counter > 0);
244 return (--ff->join_counter);
247 static int simulate_decjoin(full_frame *ff)
249 CILK_ASSERT(ff->join_counter > 0);
250 return (ff->join_counter - 1);
254 * Pseudo-random generator defined by the congruence S' = 69070 * S
255 * mod (2^32 - 5). Marsaglia (CACM July 1993) says on page 107 that
256 * this is a ``good one''. There you go.
258 * The literature makes a big fuss about avoiding the division, but
259 * for us it is not worth the hassle.
261 static const unsigned RNGMOD = ((1ULL << 32) - 5);
262 static const unsigned RNGMUL = 69070U;
264 static unsigned myrand(__cilkrts_worker *w)
266 unsigned state = w->l->rand_seed;
267 state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
268 w->l->rand_seed = state;
269 return state;
272 static void mysrand(__cilkrts_worker *w, unsigned seed)
274 seed %= RNGMOD;
275 seed += (seed == 0); /* 0 does not belong to the multiplicative
276 group. Use 1 instead */
277 w->l->rand_seed = seed;
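/* [Editorial sketch, not part of the upstream runtime, compiled out.]
 * A minimal illustration of the congruential generator described above
 * (S' = 69070 * S mod (2^32 - 5)); the demo_* names are hypothetical.
 * The 64-bit intermediate avoids overflow, exactly as in myrand(). */
#if 0
static unsigned demo_rng_step(unsigned state)
{
    return (unsigned)((69070ULL * state) % ((1ULL << 32) - 5));
}

static void demo_rng(void)
{
    unsigned s = 1;              /* any nonzero seed, as mysrand() enforces */
    int i;
    for (i = 0; i < 4; ++i) {
        s = demo_rng_step(s);
        printf("%u\n", s);       /* 69070, 475697609, ... */
    }
}
#endif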
280 /* W grabs its own lock */
281 void __cilkrts_worker_lock(__cilkrts_worker *w)
283 validate_worker(w);
284 CILK_ASSERT(w->l->do_not_steal == 0);
286 /* tell thieves to stay out of the way */
287 w->l->do_not_steal = 1;
288 __cilkrts_fence(); /* probably redundant */
290 __cilkrts_mutex_lock(w, &w->l->lock);
293 void __cilkrts_worker_unlock(__cilkrts_worker *w)
295 __cilkrts_mutex_unlock(w, &w->l->lock);
296 CILK_ASSERT(w->l->do_not_steal == 1);
297 /* The fence is probably redundant. Use a release
298        operation when supported (gcc and compatible);
299 that is faster on x86 which serializes normal stores. */
300 #if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
301 __sync_lock_release(&w->l->do_not_steal);
302 #else
303 w->l->do_not_steal = 0;
304 __cilkrts_fence(); /* store-store barrier, redundant on x86 */
305 #endif
308 /* try to acquire the lock of some *other* worker */
309 static int worker_trylock_other(__cilkrts_worker *w,
310 __cilkrts_worker *other)
312 int status = 0;
314 validate_worker(other);
316 /* This protocol guarantees that, after setting the DO_NOT_STEAL
317 flag, worker W can enter its critical section after waiting for
318 the thief currently in the critical section (if any) and at
319 most one other thief.
321 This requirement is overly paranoid, but it should protect us
322 against future nonsense from OS implementors.
325 /* compete for the right to disturb OTHER */
326 if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
327 if (other->l->do_not_steal) {
328 /* leave it alone */
329 } else {
330 status = __cilkrts_mutex_trylock(w, &other->l->lock);
332 __cilkrts_mutex_unlock(w, &other->l->steal_lock);
336 return status;
339 static void worker_unlock_other(__cilkrts_worker *w,
340 __cilkrts_worker *other)
342 __cilkrts_mutex_unlock(w, &other->l->lock);
346 /* Lock macro Usage:
347 BEGIN_WITH_WORKER_LOCK(w) {
348 statement;
349 statement;
350 BEGIN_WITH_FRAME_LOCK(w, ff) {
351 statement;
352 statement;
353 } END_WITH_FRAME_LOCK(w, ff);
354 } END_WITH_WORKER_LOCK(w);
356 #define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
357 #define END_WITH_WORKER_LOCK(w) while (__cilkrts_worker_unlock(w), 0)
359 // TBD(jsukha): These are worker lock acquisitions on
360 // a worker whose deque is empty. My conjecture is that we
361 // do not need to hold the worker lock at these points.
362 // I have left them in for now, however.
364 // #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
365 #ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
366 #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
367 #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (0)
368 #else
369 #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
370 #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (__cilkrts_worker_unlock(w), 0)
371 #endif
374 #define BEGIN_WITH_FRAME_LOCK(w, ff) \
375 do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do
377 #define END_WITH_FRAME_LOCK(w, ff) \
378 while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
380 /* W becomes the owner of F and F can be stolen from W */
381 static void make_runnable(__cilkrts_worker *w, full_frame *ff)
383 w->l->frame_ff = ff;
385 /* CALL_STACK is invalid (the information is stored implicitly in W) */
386 ff->call_stack = 0;
390 * The worker parameter is unused, except for print-debugging purposes.
392 static void make_unrunnable(__cilkrts_worker *w,
393 full_frame *ff,
394 __cilkrts_stack_frame *sf,
395 int is_loot,
396 const char *why)
398 /* CALL_STACK becomes valid again */
399 ff->call_stack = sf;
401 if (sf) {
402 #if CILK_LIB_DEBUG
403 if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
404 __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
405 #endif
406 sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
407 sf->worker = 0;
409 if (is_loot)
410 __cilkrts_put_stack(ff, sf);
412 /* perform any system-dependent action, such as saving the
413 state of the stack */
414 __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
419 /* Push the next full frame to be made active in this worker and increment its
420 * join counter. __cilkrts_push_next_frame and pop_next_frame work on a
421 * one-element queue. This queue is used to communicate across the runtime
422 * from the code that wants to activate a frame to the code that can actually
423  * begin execution on that frame.  They are asymmetrical in that push
424 * increments the join counter but pop does not decrement it. Rather, a
425 * single push/pop combination makes a frame active and increments its join
426 * counter once. */
427 void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
429 CILK_ASSERT(ff);
430 CILK_ASSERT(!w->l->next_frame_ff);
431 incjoin(ff);
432 w->l->next_frame_ff = ff;
435 /* Get the next full-frame to be made active in this worker. The join count
436 * of the full frame will have been incremented by the corresponding push
437 * event. See __cilkrts_push_next_frame, above.
439 static full_frame *pop_next_frame(__cilkrts_worker *w)
441 full_frame *ff;
442 ff = w->l->next_frame_ff;
443 // Remove the frame from the next_frame field.
445 // If this is a user worker, then there is a chance that another worker
446 // from our team could push work into our next_frame (if it is the last
447 // worker doing work for this team). The other worker's setting of the
448 // next_frame could race with our setting of next_frame to NULL. This is
449 // the only possible race condition on next_frame. However, if next_frame
450 // has a non-NULL value, then it means the team still has work to do, and
451 // there is no chance of another team member populating next_frame. Thus,
452 // it is safe to set next_frame to NULL, if it was populated. There is no
453 // need for an atomic op.
454 if (NULL != ff) {
455 w->l->next_frame_ff = NULL;
457 return ff;
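/* [Editorial sketch, compiled out.]  The intended pairing of the two
 * functions above, per the comment on __cilkrts_push_next_frame(): a
 * push/pop cycle hands a frame to a worker and leaves its join counter
 * incremented exactly once; the matching decjoin() happens later (e.g.,
 * at a provably good steal).  demo_activate_frame() is hypothetical. */
#if 0
static void demo_activate_frame(__cilkrts_worker *w, full_frame *ff)
{
    __cilkrts_push_next_frame(w, ff);       /* incjoin(ff) happens here */
    full_frame *popped = pop_next_frame(w); /* does NOT decjoin(ff)     */
    CILK_ASSERT(popped == ff);
}
#endif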
461 * Identify the single worker that is allowed to cross a sync in this frame. A
462 * thief should call this function when it is the first to steal work from a
463 * user worker. "First to steal work" may mean that there has been parallelism
464 * in the user worker before, but the whole team sync'd, and this is the first
465 * steal after that.
467 * This should happen while holding the worker and frame lock.
469 static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
471 w->l->last_full_frame = ff;
472 ff->sync_master = w;
476 * The sync that ends all parallelism for a particular user worker is about to
477 * be crossed. Decouple the worker and frame.
479 * No locks need to be held since the user worker isn't doing anything, and none
480 * of the system workers can steal from it. But unset_sync_master() should be
481  * called before the user worker knows about this work (i.e., before
482  * w->l->next_frame_ff is set).
484 static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
486 CILK_ASSERT(WORKER_USER == w->l->type);
487 CILK_ASSERT(ff->sync_master == w);
488 ff->sync_master = NULL;
489 w->l->last_full_frame = NULL;
492 /********************************************************************
493 * THE protocol:
494 ********************************************************************/
496 * This is a protocol for work stealing that minimizes the overhead on
497 * the victim.
499 * The protocol uses three shared pointers into the worker's deque:
500 * - T - the "tail"
501 * - H - the "head"
502 * - E - the "exception" NB: In this case, "exception" has nothing to do
503 * with C++ throw-catch exceptions -- it refers only to a non-normal return,
504 * i.e., a steal or similar scheduling exception.
506 * with H <= E, H <= T.
508 * Stack frames SF, where H <= E < T, are available for stealing.
510 * The worker operates on the T end of the stack. The frame being
511 * worked on is not on the stack. To make a continuation available for
512  * stealing the worker pushes a frame onto the stack: stores *T++ = SF.
513 * To return, it pops the frame off the stack: obtains SF = *--T.
515 * After decrementing T, the condition E > T signals to the victim that
516 * it should invoke the runtime system's "THE" exception handler. The
517 * pointer E can become INFINITY, in which case the victim must invoke
518 * the THE exception handler as soon as possible.
520 * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
521 * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
522 * on the THE protocol.
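/* [Editorial sketch, compiled out.]  A toy, single-threaded model of the
 * THE handshake described above, using plain integers in place of the
 * H/T/E pointers.  demo_steal() mirrors dekker_protocol() and
 * detach_for_steal() below; demo_pop() mirrors the victim's pop fast
 * path.  All demo_* names are hypothetical. */
#if 0
static int demo_H, demo_T, demo_E;   /* head, tail, "exception" */

static int demo_steal(void)          /* thief side, victim lock held */
{
    ++demo_E;                        /* increment_E()                  */
    if (demo_H < demo_T) {           /* can_steal_from()               */
        ++demo_H;                    /* detach the oldest frame        */
        return 1;
    }
    --demo_E;                        /* decrement_E(): restore state   */
    return 0;
}

static int demo_pop(void)            /* victim side */
{
    --demo_T;                        /* tentatively pop youngest frame */
    if (demo_E > demo_T)             /* conflict (or E == INFINITY):   */
        return 0;                    /* run the THE handler, which     */
                                     /* re-checks under the worker lock */
    return 1;                        /* uncontended pop                */
}
#endif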
525 /* the infinity value of E */
526 #define EXC_INFINITY ((__cilkrts_stack_frame **) (-1))
528 static void increment_E(__cilkrts_worker *victim)
530 __cilkrts_stack_frame *volatile *tmp;
532 // The currently executing worker must own the worker lock to touch
533 // victim->exc
534 ASSERT_WORKER_LOCK_OWNED(victim);
536 tmp = victim->exc;
537 if (tmp != EXC_INFINITY) {
538 /* On most x86 this pair of operations would be slightly faster
539 as an atomic exchange due to the implicit memory barrier in
540 an atomic instruction. */
541 victim->exc = tmp + 1;
542 __cilkrts_fence();
546 static void decrement_E(__cilkrts_worker *victim)
548 __cilkrts_stack_frame *volatile *tmp;
550 // The currently executing worker must own the worker lock to touch
551 // victim->exc
552 ASSERT_WORKER_LOCK_OWNED(victim);
554 tmp = victim->exc;
555 if (tmp != EXC_INFINITY) {
556 /* On most x86 this pair of operations would be slightly faster
557 as an atomic exchange due to the implicit memory barrier in
558 an atomic instruction. */
559 victim->exc = tmp - 1;
560 __cilkrts_fence(); /* memory fence not really necessary */
564 #if 0
565 /* for now unused, will be necessary if we implement abort */
566 static void signal_THE_exception(__cilkrts_worker *wparent)
568 wparent->exc = EXC_INFINITY;
569 __cilkrts_fence();
571 #endif
573 static void reset_THE_exception(__cilkrts_worker *w)
575 // The currently executing worker must own the worker lock to touch
576 // w->exc
577 ASSERT_WORKER_LOCK_OWNED(w);
579 w->exc = w->head;
580 __cilkrts_fence();
583 /* conditions under which victim->head can be stolen: */
584 static int can_steal_from(__cilkrts_worker *victim)
586 return ((victim->head < victim->tail) &&
587 (victim->head < victim->protected_tail));
590 /* Return TRUE if the frame can be stolen, false otherwise */
591 static int dekker_protocol(__cilkrts_worker *victim)
593 // increment_E and decrement_E are going to touch victim->exc. The
594 // currently executing worker must own victim's lock before they can
595 // modify it
596 ASSERT_WORKER_LOCK_OWNED(victim);
598 /* ASSERT(E >= H); */
600 increment_E(victim);
602 /* ASSERT(E >= H + 1); */
603 if (can_steal_from(victim)) {
604 /* success, we can steal victim->head and set H <- H + 1
605 in detach() */
606 return 1;
607 } else {
608 /* failure, restore previous state */
609 decrement_E(victim);
610 return 0;
615 /* Link PARENT and CHILD in the spawn tree */
616 static full_frame *make_child(__cilkrts_worker *w,
617 full_frame *parent_ff,
618 __cilkrts_stack_frame *child_sf,
619 cilk_fiber *fiber)
621 full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);
623 child_ff->parent = parent_ff;
624 push_child(parent_ff, child_ff);
626 //DBGPRINTF("%d- make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
627 // " parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
628 // " child - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
629 // w->self, child, parent, child_sf,
630 // parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
631 // child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
632 CILK_ASSERT(parent_ff->call_stack);
633 child_ff->is_call_child = (fiber == NULL);
635 /* PLACEHOLDER_FIBER is used as non-null marker indicating that
636 child should be treated as a spawn child even though we have not
637 yet assigned a real fiber to its parent. */
638 if (fiber == PLACEHOLDER_FIBER)
639 fiber = NULL; /* Parent actually gets a null fiber, for now */
641 /* perform any system-dependent actions, such as capturing
642 parameter passing information */
643 /*__cilkrts_make_child_sysdep(child, parent);*/
645 /* Child gets reducer map and stack of parent.
646 Parent gets a new map and new stack. */
647 child_ff->fiber_self = parent_ff->fiber_self;
648 child_ff->sync_master = NULL;
650 if (child_ff->is_call_child) {
651 /* Cause segfault on any attempted access. The parent gets
652 the child map and stack when the child completes. */
653 parent_ff->fiber_self = 0;
654 } else {
655 parent_ff->fiber_self = fiber;
658 incjoin(parent_ff);
659 return child_ff;
662 static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
664 __cilkrts_stack_frame *p = sf->call_parent;
665 sf->call_parent = 0;
666 return p;
669 /* w should be the currently executing worker.
670 * loot_sf is the youngest stack frame in the call stack being
671 * unrolled (i.e., the most deeply nested stack frame.)
673 * When this method is called for a steal, loot_sf should be on a
674 * victim worker which is different from w.
675 * For CILK_FORCE_REDUCE, the victim worker will equal w.
677 * Before execution, the __cilkrts_stack_frame's have pointers from
678 * older to younger, i.e., a __cilkrts_stack_frame points to parent.
680 * This method creates a full frame for each __cilkrts_stack_frame in
681 * the call stack, with each full frame also pointing to its parent.
683 * The method returns the full frame created for loot_sf, i.e., the
684 * youngest full frame.
686 static full_frame *unroll_call_stack(__cilkrts_worker *w,
687 full_frame *ff,
688 __cilkrts_stack_frame *const loot_sf)
690 __cilkrts_stack_frame *sf = loot_sf;
691 __cilkrts_stack_frame *rev_sf = 0;
692 __cilkrts_stack_frame *t_sf;
694 CILK_ASSERT(sf);
695 /*CILK_ASSERT(sf->call_parent != sf);*/
697 /* The leafmost frame is unsynched. */
698 if (sf->worker != w)
699 sf->flags |= CILK_FRAME_UNSYNCHED;
701 /* Reverse the call stack to make a linked list ordered from parent
702 to child. sf->call_parent points to the child of SF instead of
703 the parent. */
704 do {
705 t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
706 sf->call_parent = rev_sf;
707 rev_sf = sf;
708 sf = t_sf;
709 } while (sf);
710 sf = rev_sf;
712 /* Promote each stack frame to a full frame in order from parent
713 to child, following the reversed list we just built. */
714 make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
715 /* T is the *child* of SF, because we have reversed the list */
716 for (t_sf = __cilkrts_advance_frame(sf); t_sf;
717 sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
718 ff = make_child(w, ff, t_sf, NULL);
719 make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
722 /* XXX What if the leafmost frame does not contain a sync
723        and this steal is from promoting our own deque? */
724 /*sf->flags |= CILK_FRAME_UNSYNCHED;*/
726 CILK_ASSERT(!sf->call_parent);
727 return ff;
730 /* detach the top of the deque frame from the VICTIM and install a new
731 CHILD frame in its place */
732 static void detach_for_steal(__cilkrts_worker *w,
733 __cilkrts_worker *victim,
734 cilk_fiber* fiber)
736 /* ASSERT: we own victim->lock */
738 full_frame *parent_ff, *child_ff, *loot_ff;
739 __cilkrts_stack_frame *volatile *h;
740 __cilkrts_stack_frame *sf;
742 w->l->team = victim->l->team;
744 CILK_ASSERT(w->l->frame_ff == 0 || w == victim);
746 h = victim->head;
748 CILK_ASSERT(*h);
750 victim->head = h + 1;
752 parent_ff = victim->l->frame_ff;
753 BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
754 /* parent no longer referenced by victim */
755 decjoin(parent_ff);
757 /* obtain the victim call stack */
758 sf = *h;
760 /* perform system-dependent normalizations */
761 /*__cilkrts_normalize_call_stack_on_steal(sf);*/
763 /* unroll PARENT_FF with call stack SF, adopt the youngest
764 frame LOOT. If loot_ff == parent_ff, then we hold loot_ff->lock,
765 otherwise, loot_ff is newly created and we can modify it without
766 holding its lock. */
767 loot_ff = unroll_call_stack(w, parent_ff, sf);
769 #if REDPAR_DEBUG >= 3
770 fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
771 w->self, victim->self,
772 parent_ff, loot_ff);
773 #endif
775 if (WORKER_USER == victim->l->type &&
776 NULL == victim->l->last_full_frame) {
777 // Mark this looted frame as special: only the original user worker
778 // may cross the sync.
780 // This call is a shared access to
781 // victim->l->last_full_frame.
782 set_sync_master(victim, loot_ff);
785 /* LOOT is the next frame that the thief W is supposed to
786 run, unless the thief is stealing from itself, in which
787 case the thief W == VICTIM executes CHILD and nobody
788 executes LOOT. */
789 if (w == victim) {
790 /* Pretend that frame has been stolen */
791 loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
792 loot_ff->simulated_stolen = 1;
794 else
795 __cilkrts_push_next_frame(w, loot_ff);
797 // After this "push_next_frame" call, w now owns loot_ff.
798 child_ff = make_child(w, loot_ff, 0, fiber);
800 BEGIN_WITH_FRAME_LOCK(w, child_ff) {
801 /* install child in the victim's work queue, taking
802 the parent_ff's place */
803 /* child is referenced by victim */
804 incjoin(child_ff);
806 // With this call, w is bestowing ownership of the newly
807 // created frame child_ff to the victim, and victim is
808 // giving up ownership of parent_ff.
810 // Worker w will either take ownership of parent_ff
811 // if parent_ff == loot_ff, or parent_ff will be
812 // suspended.
814 // Note that this call changes the victim->frame_ff
815 // while the victim may be executing.
816 make_runnable(victim, child_ff);
817 } END_WITH_FRAME_LOCK(w, child_ff);
818 } END_WITH_FRAME_LOCK(w, parent_ff);
822 * @brief cilk_fiber_proc that resumes user code after a successful
823 * random steal.
825 * This function longjmps back into the user code whose state is
826 * stored in cilk_fiber_get_data(fiber)->resume_sf. The stack pointer
827 * is adjusted so that the code resumes on the specified fiber stack
828 * instead of its original stack.
830 * This method gets executed only on a fiber freshly allocated from a
831 * pool.
833 * @param fiber The fiber being used to resume user code.
834 * @param arg Unused.
836 static
837 void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
839 cilk_fiber_data *data = cilk_fiber_get_data(fiber);
840 __cilkrts_stack_frame* sf = data->resume_sf;
841 full_frame *ff;
843 CILK_ASSERT(sf);
845 // When we pull the resume_sf out of the fiber to resume it, clear
846 // the old value.
847 data->resume_sf = NULL;
848 CILK_ASSERT(sf->worker == data->owner);
849 ff = sf->worker->l->frame_ff;
851 // For Win32, we need to overwrite the default exception handler
852 // in this function, so that when the OS exception handling code
853 // walks off the top of the current Cilk stack, it reaches our stub
854 // handler.
856 // Also, this function needs to be wrapped into a try-catch block
857 // so the compiler generates the appropriate exception information
858 // in this frame.
860 // TBD: IS THIS HANDLER IN THE WRONG PLACE? Can we longjmp out of
861 // this function (and does it matter?)
862 #if defined(_WIN32) && !defined(_WIN64)
863 install_exception_stub_handler();
864 __try
865 #endif
867 char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);
869 // Notify the Intel tools that we're stealing code
870 ITT_SYNC_ACQUIRED(sf->worker);
871 NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
873 // TBD: We'd like to move TBB-interop methods into the fiber
874 // eventually.
875 cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
877 sf->flags &= ~CILK_FRAME_SUSPENDED;
879 // longjmp to user code. Don't process exceptions here,
880 // because we are resuming a stolen frame.
881 sysdep_longjmp_to_sf(new_sp, sf, NULL);
882 /*NOTREACHED*/
883 // Intel's C compiler respects the preceding lint pragma
885 #if defined(_WIN32) && !defined(_WIN64)
886     __except (CILK_ASSERT(!"should not execute the stub filter"),
887 EXCEPTION_EXECUTE_HANDLER)
889 // If we are here, that means something very wrong
890 // has happened in our exception processing...
891 CILK_ASSERT(! "should not be here!");
893 #endif
896 static void random_steal(__cilkrts_worker *w)
898 __cilkrts_worker *victim = NULL;
899 cilk_fiber *fiber = NULL;
900 int n;
901 int success = 0;
902 int32_t victim_id;
904 // Nothing's been stolen yet. When true, this will flag
905 // setup_for_execution_pedigree to increment the pedigree
906 w->l->work_stolen = 0;
908 /* If the user has disabled stealing (using the debugger) we fail */
909 if (__builtin_expect(w->g->stealing_disabled, 0))
910 return;
912 CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);
914     /* If there is only one processor, work can still be stolen.
915 There must be only one worker to prevent stealing. */
916 CILK_ASSERT(w->g->total_workers > 1);
918 /* pick random *other* victim */
919 n = myrand(w) % (w->g->total_workers - 1);
920 if (n >= w->self)
921 ++n;
923 // If we're replaying a log, override the victim. -1 indicates that
924 // we've exhausted the list of things this worker stole when we recorded
925 // the log so just return. If we're not replaying a log,
926 // replay_get_next_recorded_victim() just returns the victim ID passed in.
927 n = replay_get_next_recorded_victim(w, n);
928 if (-1 == n)
929 return;
931 victim = w->g->workers[n];
933 START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
934 /* Verify that we can get a stack. If not, no need to continue. */
935 fiber = cilk_fiber_allocate(&w->l->fiber_pool);
936 } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);
939 if (NULL == fiber) {
940 #if FIBER_DEBUG >= 2
941 fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
942 w->self);
943 #endif
944 return;
947 /* do not steal from self */
948 CILK_ASSERT (victim != w);
950 /* Execute a quick check before engaging in the THE protocol.
951 Avoid grabbing locks if there is nothing to steal. */
952 if (!can_steal_from(victim)) {
953 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
954 START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
955 int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
956 // Fibers we use when trying to steal should not be active,
957 // and thus should not have any other references.
958 CILK_ASSERT(0 == ref_count);
959 } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
960 return;
963 /* Attempt to steal work from the victim */
964 if (worker_trylock_other(w, victim)) {
965 if (w->l->type == WORKER_USER && victim->l->team != w) {
967 // Fail to steal if this is a user worker and the victim is not
968 // on this team. If a user worker were allowed to steal work
969 // descended from another user worker, the former might not be
970 // done with its work by the time it was needed to resume and
971 // unbind. Therefore, user workers are not permitted to change
972 // teams.
974 // There is no race on the victim's team because the victim cannot
975 // change its team until it runs out of work to do, at which point
976 // it will try to take out its own lock, and this worker already
977 // holds it.
978 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);
980 } else if (victim->l->frame_ff) {
981 // A successful steal will change victim->frame_ff, even
982 // though the victim may be executing. Thus, the lock on
983 // the victim's deque is also protecting victim->frame_ff.
984 if (dekker_protocol(victim)) {
985 int proceed_with_steal = 1; // optimistic
987                 // If we're replaying a log, verify that this is the correct frame
988 // to steal from the victim
989 if (! replay_match_victim_pedigree(w, victim))
991 // Abort the steal attempt. decrement_E(victim) to
992 // counter the increment_E(victim) done by the
993 // dekker protocol
994 decrement_E(victim);
995 proceed_with_steal = 0;
998 if (proceed_with_steal)
1000 START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
1001 success = 1;
1002 detach_for_steal(w, victim, fiber);
1003 victim_id = victim->self;
1005 #if REDPAR_DEBUG >= 1
1006 fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
1007 w->self, victim->self, fiber);
1008 #endif
1010 // The use of victim->self contradicts our
1011 // classification of the "self" field as
1012 // local. But since this code is only for
1013 // debugging, it is ok.
1014 DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
1015 " sf: %p, call parent: %p\n",
1016 w->self, GetCurrentFiber(), victim->self,
1017 w->l->next_frame_ff->call_stack,
1018 w->l->next_frame_ff->call_stack->call_parent);
1019 } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
1020 } // end if(proceed_with_steal)
1021 } else {
1022 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
1024 } else {
1025 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
1027 worker_unlock_other(w, victim);
1028 } else {
1029 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
1032 // Record whether work was stolen. When true, this will flag
1033 // setup_for_execution_pedigree to increment the pedigree
1034 w->l->work_stolen = success;
1036 if (0 == success) {
1037 // failed to steal work. Return the fiber to the pool.
1038 START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
1039 int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
1040 // Fibers we use when trying to steal should not be active,
1041 // and thus should not have any other references.
1042 CILK_ASSERT(0 == ref_count);
1043 } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
1045 else
1047 // Since our steal was successful, finish initialization of
1048 // the fiber.
1049 cilk_fiber_reset_state(fiber,
1050 fiber_proc_to_resume_user_code_for_random_steal);
1051 // Record the pedigree of the frame that w has stolen.
1052 // record only if CILK_RECORD_LOG is set
1053 replay_record_steal(w, victim_id);
1060 * At a provably good steal, we need to transfer the child reducer map
1061 * from ff->children_reducer_map into v->reducer_map, where v is the
1062 * worker that resumes execution of ff.
1064 * Normally, we have v == w, where w is the currently executing
1065 * worker. In the case where we are resuming a team leader on a user
1066 * worker, however, v might differ from w.
1068  * Thus, this operation is a no-op, since we can't really move
1069 * ff->children_reducer_map into w here.
1071 * Instead, this work is done in setup_for_execution_reducers().
1073 static inline void provably_good_steal_reducers(__cilkrts_worker *w,
1074 full_frame *ff)
1076 // No-op.
1079 /* at a provably good steal, incorporate the accumulated exceptions of
1080 children into the parent's exception */
1081 static void provably_good_steal_exceptions(__cilkrts_worker *w,
1082 full_frame *ff)
1084 // ASSERT: we own ff->lock
1085 ff->pending_exception =
1086 __cilkrts_merge_pending_exceptions(w,
1087 ff->child_pending_exception,
1088 ff->pending_exception);
1089 ff->child_pending_exception = NULL;
1092 /* At sync discard the frame's old stack and take the leftmost child's. */
1093 static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
1095 CILK_ASSERT(NULL == ff->fiber_self);
1096 ff->fiber_self = ff->fiber_child;
1097 ff->fiber_child = NULL;
1100 static void __cilkrts_mark_synched(full_frame *ff)
1102 ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
1103 ff->simulated_stolen = 0;
1106 static
1107 enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
1108 full_frame *ff)
1110 // ASSERT: we hold w->lock and ff->lock
1112 enum provably_good_steal_t result = ABANDON_EXECUTION;
1114 // If the current replay entry is a sync record matching the worker's
1115 // pedigree, AND this isn't the last child to the sync, return
1116 // WAIT_FOR_CONTINUE to indicate that the caller should loop until
1117 // we find the right frame to steal and CONTINUE_EXECUTION is returned.
1118 int match_found = replay_match_sync_pedigree(w);
1119 if (match_found && (0 != simulate_decjoin(ff)))
1120 return WAIT_FOR_CONTINUE;
1122 START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
1123 if (decjoin(ff) == 0) {
1124 provably_good_steal_reducers(w, ff);
1125 provably_good_steal_exceptions(w, ff);
1126 provably_good_steal_stacks(w, ff);
1127 __cilkrts_mark_synched(ff);
1129 // If the original owner wants this frame back (to resume
1130 // it on its original thread) pass it back now.
1131 if (NULL != ff->sync_master) {
1132 // The frame wants to go back and be executed by the original
1133 // user thread. We can throw caution to the wind and push the
1134 // frame straight onto its queue because the only way we have
1135 // gotten to this point of being able to continue execution of
1136 // the frame is if the original user worker is spinning without
1137 // work.
1139 unset_sync_master(w->l->team, ff);
1140 __cilkrts_push_next_frame(w->l->team, ff);
1142 // If this is the team leader we're not abandoning the work
1143 if (w == w->l->team)
1144 result = CONTINUE_EXECUTION;
1145 } else {
1146 __cilkrts_push_next_frame(w, ff);
1147 result = CONTINUE_EXECUTION; // Continue working on this thread
1150 // The __cilkrts_push_next_frame() call changes ownership
1151 // of ff to the specified worker.
1153 } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);
1155 // Only write a SYNC record if:
1156 // - We're recording a log *AND*
1157 // - We're the worker continuing from this sync
1158 replay_record_sync(w, result == CONTINUE_EXECUTION);
1160 // If we're replaying a log, and matched a sync from the log, mark the
1161 // sync record seen if the sync isn't going to be abandoned.
1162 replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);
1164 return result;
1167 static void unconditional_steal(__cilkrts_worker *w,
1168 full_frame *ff)
1170 // ASSERT: we hold ff->lock
1172 START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
1173 decjoin(ff);
1174 __cilkrts_push_next_frame(w, ff);
1175 } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
1179 /* CHILD is about to die. Give its exceptions to a sibling or to the
1180 parent. */
1181 static inline void splice_exceptions_for_call(__cilkrts_worker *w,
1182 full_frame *parent_ff,
1183 full_frame *child_ff)
1185 // ASSERT: We own parent_ff->lock
1186 CILK_ASSERT(child_ff->is_call_child);
1187 CILK_ASSERT(NULL == child_ff->right_pending_exception);
1188 CILK_ASSERT(NULL == parent_ff->pending_exception);
1190 parent_ff->pending_exception = child_ff->pending_exception;
1191 child_ff->pending_exception = NULL;
1195 * Merge exceptions for a dying child.
1197 * @param w The currently executing worker.
1198 * @param ff The child frame that is dying.
1199 * @param left_exception_ptr Pointer to the exception that is to our left.
1201 static inline
1202 void splice_exceptions_for_spawn(__cilkrts_worker *w,
1203 full_frame *ff,
1204 struct pending_exception_info **left_exception_ptr)
1206 // ASSERT: parent_ff == child_ff->parent.
1207 // ASSERT: We own parent_ff->lock
1209 // Merge current exception into the slot where the left
1210 // exception should go.
1211 *left_exception_ptr =
1212 __cilkrts_merge_pending_exceptions(w,
1213 *left_exception_ptr,
1214 ff->pending_exception);
1215 ff->pending_exception = NULL;
1218 // Merge right exception into the slot where the left exception
1219 // should go.
1220 *left_exception_ptr =
1221 __cilkrts_merge_pending_exceptions(w,
1222 *left_exception_ptr,
1223 ff->right_pending_exception);
1224 ff->right_pending_exception = NULL;
1228 static inline void splice_stacks_for_call(__cilkrts_worker *w,
1229 full_frame *parent_ff,
1230 full_frame *child_ff)
1232 #if CILK_LIB_DEBUG
1233 if (parent_ff->call_stack)
1234 CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
1235 #endif
1237 /* A synched frame does not have accumulated child reducers. */
1238 CILK_ASSERT(!child_ff->fiber_child);
1239 CILK_ASSERT(child_ff->is_call_child);
1241 /* An attached parent has no self fiber. It may have
1242 accumulated child fibers or child owners, which should be
1243 ignored until sync. */
1244 CILK_ASSERT(!parent_ff->fiber_self);
1245 parent_ff->fiber_self = child_ff->fiber_self;
1246 child_ff->fiber_self = NULL;
1249 static void finalize_child_for_call(__cilkrts_worker *w,
1250 full_frame *parent_ff,
1251 full_frame *child_ff)
1253 // ASSERT: we hold w->lock and parent_ff->lock
1255 START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
1256 CILK_ASSERT(child_ff->is_call_child);
1257 CILK_ASSERT(child_ff->join_counter == 0);
1258 CILK_ASSERT(!child_ff->rightmost_child);
1259 CILK_ASSERT(child_ff == parent_ff->rightmost_child);
1261 // CHILD is about to die.
1262 // Splicing out reducers is a no-op for a call since
1263 // w->reducer_map should already store the correct
1264 // reducer map.
1266 // ASSERT there are no maps left to reduce.
1267 CILK_ASSERT(NULL == child_ff->children_reducer_map);
1268 CILK_ASSERT(NULL == child_ff->right_reducer_map);
1270 splice_exceptions_for_call(w, parent_ff, child_ff);
1272 splice_stacks_for_call(w, parent_ff, child_ff);
1274 /* remove CHILD from list of children of PARENT */
1275 unlink_child(parent_ff, child_ff);
1277 /* continue with the parent. */
1278 unconditional_steal(w, parent_ff);
1279 __cilkrts_destroy_full_frame(w, child_ff);
1280 } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
1285 * The invariant on ff->children_reducer_map is that when ff is
1286 * synched and when we are about to resume execution of ff, at least
1287 * one of ff->children_reducer_map and w->reducer_map must be NULL.
1289 * Consider the two possibilities before resuming execution of ff:
1291 * 1. Suppose ff is synched and suspended. Then either
1293 * (a) ff->children_reducer_map stores the reducer map that w
1294 * should use, where w is the worker resuming execution of ff,
1295 * OR
1296 * (b) w already has a user map, and ff->children_reducer_map is NULL.
1298 * Case (a) happens when we are resuming execution of ff as a
1299 * provably good steal. In this case, w->reducer_map should be
1300 * NULL and ff->children_reducer_map is valid. To resume
1301 * execution of ff on w, set w->reducer_map to
1302 * ff->children_reducer_map.
1304 * Case (b) occurs when we resume execution of ff because ff is a
1305 * called child. Then, ff->children_reducer_map should be NULL,
1306 * and w should already have a valid reducer map when resuming
1307 * execution of ff. We resume execution of ff without changing
1308 * w->reducer_map.
1310 * 2. Suppose frame ff is not synched (i.e., it is active and might have
1311 * active children). Then ff->children_reducer_map is the slot for
1312 * storing the reducer map from ff's leftmost child, as in the reducer
1313 * protocol. The runtime may resume execution of ff while it is not
1314 * synched only because of a steal.
1315 * In this case, while we are resuming ff, ff->children_reducer_map
1316 * may be non-NULL (because one of ff's children has completed).
1317 * We resume execution of ff without changing w->reducer_map.
1319 static void setup_for_execution_reducers(__cilkrts_worker *w,
1320 full_frame *ff)
1322 // We only need to move ff->children_reducer_map into
1323 // w->reducer_map in case 1(a).
1325 // First check whether ff is synched.
1326 __cilkrts_stack_frame *sf = ff->call_stack;
1327 if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
1328 // In this case, ff is synched. (Case 1).
1329 CILK_ASSERT(!ff->rightmost_child);
1331 // Test whether we are in case 1(a) and have
1332 // something to do. Note that if both
1333 // ff->children_reducer_map and w->reducer_map are NULL, we
1334 // can't distinguish between cases 1(a) and 1(b) here.
1335 if (ff->children_reducer_map) {
1336 // We are in Case 1(a).
1337 CILK_ASSERT(!w->reducer_map);
1338 w->reducer_map = ff->children_reducer_map;
1339 ff->children_reducer_map = NULL;
1344 static void setup_for_execution_exceptions(__cilkrts_worker *w,
1345 full_frame *ff)
1347 CILK_ASSERT(NULL == w->l->pending_exception);
1348 w->l->pending_exception = ff->pending_exception;
1349 ff->pending_exception = NULL;
1352 #if 0 /* unused */
1353 static void setup_for_execution_stack(__cilkrts_worker *w,
1354 full_frame *ff)
1357 #endif
1360 * setup_for_execution_pedigree
1362 * Copies the pedigree information from the frame we're resuming to the
1363 * worker. Increments the pedigree if this is work that has been stolen
1364 * to match the increment on a return from a spawn helper.
1366 static void setup_for_execution_pedigree(__cilkrts_worker *w)
1368 int pedigree_unsynched;
1369 __cilkrts_stack_frame *sf = w->current_stack_frame;
1371 CILK_ASSERT(NULL != sf);
1373 // If this isn't an ABI 1 or later frame, there's no pedigree information
1374 if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
1375 return;
1377 // Note whether the pedigree is unsynched and clear the flag before
1378 // we forget
1379 pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1380 sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1382 // If we're just marshalling onto this worker, do not increment
1383 // the rank since that wouldn't happen in a sequential execution
1384 if (w->l->work_stolen || pedigree_unsynched)
1386 if (w->l->work_stolen)
1387 w->pedigree.rank = sf->parent_pedigree.rank + 1;
1388 else
1389 w->pedigree.rank = sf->parent_pedigree.rank;
1392 w->pedigree.parent = sf->parent_pedigree.parent;
1393 w->l->work_stolen = 0;
1396 static void setup_for_execution(__cilkrts_worker *w,
1397 full_frame *ff,
1398 int is_return_from_call)
1400 // ASSERT: We own w->lock and ff->lock || P == 1
1402 setup_for_execution_reducers(w, ff);
1403 setup_for_execution_exceptions(w, ff);
1404 /*setup_for_execution_stack(w, ff);*/
1406 ff->call_stack->worker = w;
1407 w->current_stack_frame = ff->call_stack;
1409 // If this is a return from a call, leave the pedigree alone
1410 if (! is_return_from_call)
1411 setup_for_execution_pedigree(w);
1413 __cilkrts_setup_for_execution_sysdep(w, ff);
1415 w->head = w->tail = w->l->ltq;
1416 reset_THE_exception(w);
1418 make_runnable(w, ff);
1423 * Called by the scheduling fiber, right before
1424 * resuming a sf/ff for user code.
1426 * This method associates the specified sf with the worker.
1428 * It also asserts that w, ff, and sf all have the expected properties
1429 * for resuming user code.
1431 void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
1432 full_frame *ff,
1433 __cilkrts_stack_frame *sf)
1435 w->current_stack_frame = sf;
1436 sf->worker = w;
1438 // Lots of debugging checks on the state of the fiber we might be
1439 // resuming.
1440 #if FIBER_DEBUG >= 1
1441 # if FIBER_DEBUG >= 3
1443 fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
1444 w->self, ff, sf);
1446 # endif
1448 const int flags = sf->flags;
1449 CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
1450 CILK_ASSERT(!sf->call_parent);
1451 CILK_ASSERT(w->head == w->tail);
1453 /* A frame can not be resumed unless it was suspended. */
1454 CILK_ASSERT(ff->sync_sp != NULL);
1456 /* The leftmost frame has no allocated stack */
1457 if (ff->simulated_stolen)
1458 CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
1459 else if (flags & CILK_FRAME_UNSYNCHED)
1460 /* XXX By coincidence sync_sp could be null. */
1461 CILK_ASSERT(ff->fiber_self != NULL);
1462 else
1463 /* XXX This frame could be resumed unsynched on the leftmost stack */
1464 CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
1465 CILK_ASSERT(w->l->frame_ff == ff);
1466 #endif
1471 * This method is the first method that should execute after we've
1472 * switched to a scheduling fiber from user code.
1474 * @param fiber The scheduling fiber for the current worker.
1475 * @param wptr The current worker.
1477 static void enter_runtime_transition_proc(cilk_fiber *fiber)
1479 // We can execute this method for one of three reasons:
1480 // 1. Undo-detach finds parent stolen.
1481 // 2. Sync suspends frame.
1482 // 3. Return from Cilk entry point.
1485 // In cases 1 and 2, the frame may be truly suspended or
1486 // may be immediately executed by this worker after provably_good_steal.
1489     // There is a fourth case, which can, but does not need to, execute
1490 // this function:
1491 // 4. Starting up the scheduling loop on a user or
1492 // system worker. In this case, we won't have
1493 // a scheduling stack function to run.
1494 __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1495 if (w->l->post_suspend) {
1496 // Run the continuation function passed to longjmp_into_runtime
1497 run_scheduling_stack_fcn(w);
1499 // After we have jumped into the runtime and run the
1500 // scheduling function, any reducer map the worker had before entering the runtime
1501 // should have already been saved into the appropriate full
1502 // frame.
1503 CILK_ASSERT(NULL == w->reducer_map);
1505 // There shouldn't be any uncaught exceptions.
1507 // In Windows, the OS catches any exceptions not caught by the
1508 // user code. Thus, we are omitting the check on Windows.
1510 // On Android, calling std::uncaught_exception with the stlport
1511 // library causes a seg fault. Since we're not supporting
1512 // exceptions there at this point, just don't do the check
1514 // TBD: Is this check also safe to do on Windows?
1515 CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1521 * Method called to jump back to executing user code.
1523 * A normal return from the runtime back to resuming user code calls
1524 * this method. A computation executed using force_reduce also calls
1525 * this method to return to user code.
1527 * This function should not contain any code that depends on a fiber.
1528 * In a force-reduce case, the user worker may not have a fiber. In
1529 * the force-reduce case, we call this method directly instead of
1530 * calling @c user_code_resume_after_switch_into_runtime.
1532 static inline NORETURN
1533 cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
1535 // Save the sync stack pointer, and do the bookkeeping
1536 char* sync_sp = ff->sync_sp;
1537 __cilkrts_take_stack(ff, sync_sp); // leaves ff->sync_sp null
1539 sf->flags &= ~CILK_FRAME_SUSPENDED;
1540 // Actually longjmp to the user code.
1541 // We may have exceptions to deal with, since we are resuming
1542 // a previous-suspended frame.
1543 sysdep_longjmp_to_sf(sync_sp, sf, ff);
1548 * Called by the user-code fiber right before resuming a full frame
1549 * (sf/ff).
1551 * This method pulls sf/ff out of the worker, and then calls
1552 * cilkrts_resume to jump to user code.
1554 static NORETURN
1555 user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
1557 __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
1558 __cilkrts_stack_frame *sf;
1559 full_frame *ff;
1560 sf = w->current_stack_frame;
1561 ff = sf->worker->l->frame_ff;
1563 #if FIBER_DEBUG >= 1
1564 CILK_ASSERT(ff->fiber_self == fiber);
1565 cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
1566 DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
1567 w->self, w, fiber);
1568 CILK_ASSERT(sf == fdata->resume_sf);
1569 #endif
1571 // Notify the Intel tools that we're stealing code
1572 ITT_SYNC_ACQUIRED(sf->worker);
1573 NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
1574 cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
1576 // Actually jump to user code.
1577 cilkrts_resume(sf, ff);
1581 /* The current stack is about to either be suspended or destroyed. This
1582 * function will switch to the stack on which the scheduler is suspended and
1583 * resume running the scheduler within function do_work(). Upon waking up,
1584 * the scheduler will run the 'cont' function, using the supplied worker and
1585 * frame.
1587 static NORETURN
1588 longjmp_into_runtime(__cilkrts_worker *w,
1589 scheduling_stack_fcn_t fcn,
1590 __cilkrts_stack_frame *sf)
1592 full_frame *ff, *ff2;
1594 CILK_ASSERT(!w->l->post_suspend);
1595 ff = w->l->frame_ff;
1597 // If we've got only one worker, stealing shouldn't be possible.
1598 // Assume that this is a steal or return from spawn in a force-reduce case.
1599 // We don't have a scheduling stack to switch to, so call the continuation
1600 // function directly.
1601 if (1 == w->g->P) {
1602 fcn(w, ff, sf);
1604 /* The call to function c() will have pushed ff as the next frame. If
1605 * this were a normal (non-forced-reduce) execution, there would have
1606 * been a pop_next_frame call in a separate part of the runtime. We
1607 * must call pop_next_frame here to complete the push/pop cycle. */
1608 ff2 = pop_next_frame(w);
1610 setup_for_execution(w, ff2, 0);
1611 scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
1612 cilkrts_resume(w->current_stack_frame, ff2);
1614 // Suppress clang warning that the expression result is unused
1615 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1616 # pragma clang diagnostic push
1617 # pragma clang diagnostic ignored "-Wunused-value"
1618 #endif // __clang__
1619 /* no return */
1620 CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
1621 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1622 # pragma clang diagnostic pop
1623 #endif // __clang__
1626 w->l->post_suspend = fcn;
1627 w->l->suspended_stack = sf;
1629 ITT_SYNC_RELEASING(w);
1630 ITT_SYNC_PREPARE(w);
1632 #if FIBER_DEBUG >= 2
1633 fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
1634 cilkos_get_current_thread_id(),
1635 w->self, w->l->frame_ff,
1636 sf);
1637 #endif
1639 // Current fiber is either the (1) one we are about to free,
1640 // or (2) it has been passed up to the parent.
1641 cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
1642 w->l->fiber_to_free :
1643 w->l->frame_ff->parent->fiber_child );
1644 cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
1645 CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);
1647 // Clear the sf in the current fiber for cleanliness, to prevent
1648 // us from accidentally resuming a bad sf.
1649 // Technically, resume_sf gets overwritten for a fiber when
1650 // we are about to resume it anyway.
1651 fdata->resume_sf = NULL;
1652 CILK_ASSERT(fdata->owner == w);
1654 // Set the function to execute immediately after switching to the
1655 // scheduling fiber, but before freeing any fibers.
1656 cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
1657 enter_runtime_transition_proc);
1658 cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);
1660 if (w->l->fiber_to_free) {
1661 // Case 1: we are freeing this fiber. We never
1662 // resume this fiber again after jumping into the runtime.
1663 w->l->fiber_to_free = NULL;
1665 // Extra check. Normally, the fiber we are about to switch to
1666 // should have a NULL owner.
1667 CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
1668 #if FIBER_DEBUG >= 4
1669         fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... current_fiber = %p, deallocate, switch to fiber %p\n",
1670 cilkos_get_current_thread_id(),
1671 w->self,
1672 current_fiber, w->l->scheduling_fiber);
1673 #endif
1674 cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
1675 NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
1676 cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
1677 &w->l->fiber_pool,
1678 w->l->scheduling_fiber);
1679 // We should never come back here!
1680 CILK_ASSERT(0);
1682 else {
1683 // Case 2: We are passing the fiber to our parent because we
1684 // are leftmost. We should come back later to
1685 // resume execution of user code.
1687         // If we are not freeing a fiber, then we must be
1688 // returning from a spawn or processing an exception. The
1689 // "sync" path always frees a fiber.
1691 // We must be the leftmost child, and by left holder logic, we
1692 // have already moved the current fiber into our parent full
1693 // frame.
1694 #if FIBER_DEBUG >= 2
1695         fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime... current_fiber = %p, deallocate, switch to fiber %p\n",
1696 cilkos_get_current_thread_id(),
1697 w->self,
1698 current_fiber, w->l->scheduling_fiber);
1699 #endif
1701 NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
1703 cilk_fiber_suspend_self_and_resume_other(current_fiber,
1704 w->l->scheduling_fiber);
1705 // Resuming this fiber returns control back to
1706 // this function because our implementation uses OS fibers.
1708 // On Unix, we could instead pass
1709 // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
1710 // that resumes execution of user code directly, instead of
1711 // jumping back here first and then jumping back to user code.
1712 #if FIBER_DEBUG >= 2
1713 CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
1714 #endif
1715 user_code_resume_after_switch_into_runtime(current_fiber);
1720 * Send a message to the children of the specified worker: run or wait.
1722 static void notify_children(__cilkrts_worker *w, unsigned int msg)
1724 int child_num;
1725 __cilkrts_worker *child;
1726 int num_sys_workers = w->g->P - 1;
1728 // If worker is "n", then its children are 2n + 1, and 2n + 2.
1729 child_num = (w->self << 1) + 1;
1730 if (child_num < num_sys_workers) {
1731 child = w->g->workers[child_num];
1732 CILK_ASSERT(child->l->signal_node);
1733 signal_node_msg(child->l->signal_node, msg);
1734 child_num++;
1735 if (child_num < num_sys_workers) {
1736 child = w->g->workers[child_num];
1737 CILK_ASSERT(child->l->signal_node);
1738 signal_node_msg(child->l->signal_node, msg);
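/* As a rough illustration of the fan-out above (a sketch, not part of the
 * runtime logic): the system workers form an implicit binary heap laid out
 * in w->g->workers[0 .. P-2], and a run/wait message posted at the root
 * reaches every system worker because each child, once awake, notifies its
 * own children in turn.  The same pair of checks, written as a loop over
 * both children:
 *
 *     int first_child = 2 * w->self + 1;
 *     for (int c = first_child; c < first_child + 2; ++c) {
 *         if (c < w->g->P - 1)                    // system workers only
 *             signal_node_msg(w->g->workers[c]->l->signal_node, msg);
 *     }
 */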
1744 * Notify this worker's children that they need to wait.
1746 static void notify_children_wait(__cilkrts_worker *w)
1748 notify_children(w, 0);
1752 * Notify this worker's children to run and start trying to steal.
1754 static void notify_children_run(__cilkrts_worker *w)
1756 notify_children(w, 1);
1760 * A single "check" to find work, either on our queue or through a
1761 * steal attempt. This method checks our local queue once, and
1762 * performs one steal attempt.
1764 static full_frame* check_for_work(__cilkrts_worker *w)
1766 full_frame *ff = NULL;
1767 ff = pop_next_frame(w);
1768 // If there is no work on the queue, try to steal some.
1769 if (NULL == ff) {
1770 START_INTERVAL(w, INTERVAL_STEALING) {
1771 if (w->l->type != WORKER_USER && w->l->team != NULL) {
1772 // At this point, the worker knows for certain that it has run
1773 // out of work. Therefore, it loses its team affiliation. User
1774 // workers never change teams, of course.
1775 __cilkrts_worker_lock(w);
1776 w->l->team = NULL;
1777 __cilkrts_worker_unlock(w);
1780 // If we are about to do a random steal, we should have no
1781 // full frame...
1782 CILK_ASSERT(NULL == w->l->frame_ff);
1783 random_steal(w);
1784 } STOP_INTERVAL(w, INTERVAL_STEALING);
1786 // If the steal was successful, then the worker has populated its next
1787 // frame with the work to resume.
1788 ff = pop_next_frame(w);
1789 if (NULL == ff) {
1790 // Punish the worker for failing to steal.
1791 // No quantum for you!
1792 unsigned int max_fails = w->g->max_steal_failures << 1;
1793 if (w->l->has_stolen == 0 &&
1794 w->l->steal_failure_count % max_fails == max_fails - 1) {
1795 // Idle briefly if the worker has never stolen anything for
1796 // the given grace period
1797 __cilkrts_idle();
1798 } else {
1799 __cilkrts_yield();
1801 w->l->steal_failure_count++;
1802 if (w->l->steal_failure_count > (max_fails << 8)) {
1803 // Reset the flag after a certain number of failures
1804 // - This will reduce cpu time in top-level synched regions
1805 // - max_fails can be controlled by user (CILK_STEAL_FAILURES)
1806 w->l->has_stolen = 0;
1808 } else {
1809 // Reset steal_failure_count since there is obviously still work to
1810 // be done.
1811 w->l->steal_failure_count = 0;
1812 w->l->has_stolen = 1;
1815 return ff;
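/* A worked example of the back-off above, assuming (hypothetically) that
 * CILK_STEAL_FAILURES sets w->g->max_steal_failures to 128: max_fails is
 * then 256, so a worker that has never stolen yields on 255 of every 256
 * failed attempts and idles on the remaining one; once its
 * steal_failure_count climbs past 256 << 8 = 65536, has_stolen is cleared,
 * so even a worker that stole long ago drops back to the cheaper idle
 * behavior in top-level synched regions.
 */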
1819 * Keep stealing or looking on our queue.
1821 * Returns a full frame when one is found, or NULL if the
1822 * computation is done.
1824 static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
1826 full_frame *ff = NULL;
1827 // Find a full frame to execute (either through random stealing,
1828 // or because we pull it off w's 1-element queue).
1829 while (!ff) {
1830 // Check worker state to figure out our next action.
1831 switch (worker_runnable(w))
1833 case SCHEDULE_RUN: // One attempt at checking for work.
1834 ff = check_for_work(w);
1835 break;
1836 case SCHEDULE_WAIT: // go into wait-mode.
1837 START_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1838 CILK_ASSERT(WORKER_SYSTEM == w->l->type);
1839 // If we are about to wait, then we better not have
1840 // a frame that we should execute...
1841 CILK_ASSERT(NULL == w->l->next_frame_ff);
1842 notify_children_wait(w);
1843 signal_node_wait(w->l->signal_node);
1844 // ...
1845 // Runtime is waking up.
1846 notify_children_run(w);
1847 w->l->steal_failure_count = 0;
1848 STOP_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1849 break;
1850 case SCHEDULE_EXIT: // exit the scheduler.
1851 CILK_ASSERT(WORKER_USER != w->l->type);
1852 return NULL;
1853 default:
1854 CILK_ASSERT(0);
1855 abort();
1858 return ff;
1862 * The proc method for a scheduling fiber on a user worker.
1864 * When a user worker jumps into the runtime, it jumps into this
1865 * method by either starting it if the scheduling fiber has never run
1866 * before, or resuming the fiber if it was previously suspended.
1868 COMMON_PORTABLE
1869 void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
1871 __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1872 CILK_ASSERT(w);
1874 // This must be a user worker
1875 CILK_ASSERT(WORKER_USER == w->l->type);
1877 // If we aren't the current worker, then something is very wrong
1878 // here..
1879 verify_current_wkr(w);
1881 __cilkrts_run_scheduler_with_exceptions(w);
1886 * The body of the runtime scheduling loop. This function executes in
1887 * 4 stages:
1889 * 1. Transitions from the user code into the runtime by
1890 * executing any scheduling-stack functions.
1891 * 2. Looks for a full frame enqueued from a successful provably
1892 * good steal.
1893 * 3. If no full frame is found in step 2, steal until
1894 * a frame is found or we are done. If we are done, finish
1895 * the scheduling loop.
1896 * 4. When a frame is found, set up to resume user code.
1897 * In particular, suspend the current fiber and resume the
1898 * user fiber to execute the frame.
1900 * Returns a fiber object that we should switch to after completing
1901 * the body of the loop, or NULL if we should continue executing on
1902 * this fiber.
1904 * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
1906 * @param current_fiber The currently executing (scheduling) fiber.
1907 * @param wptr The currently executing worker.
1908 * @return The next fiber we should switch to.
1910 static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
1911 void* wptr)
1913 __cilkrts_worker *w = (__cilkrts_worker*) wptr;
1914 CILK_ASSERT(current_fiber == w->l->scheduling_fiber);
1916 // Stage 1: Transition from executing user code to the runtime code.
1917 // We don't need to do this call here any more, because
1918 // every switch to the scheduling fiber should make this call
1919 // using a post_switch_proc on the fiber.
1921 // enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);
1923 // After Stage 1 is complete, w should no longer have
1924 // an associated full frame.
1925 CILK_ASSERT(NULL == w->l->frame_ff);
1927 // Stage 2. First do a quick check of our 1-element queue.
1928 full_frame *ff = pop_next_frame(w);
1930 if (!ff) {
1931 // Stage 3. We didn't find anything from our 1-element
1932 // queue. Now go through the steal loop to find work.
1933 ff = search_until_work_found_or_done(w);
1934 if (!ff) {
1935 CILK_ASSERT(w->g->work_done);
1936 return NULL;
1940 // Stage 4. Now that we have found a full frame to work on,
1941 // actually execute it.
1942 __cilkrts_stack_frame *sf;
1944 // There shouldn't be any uncaught exceptions.
1946 // In Windows, the OS catches any exceptions not caught by the
1947 // user code. Thus, we are omitting the check on Windows.
1949 // On Android, calling std::uncaught_exception with the stlport
1950 // library causes a seg fault. Since we're not supporting
1951 // exceptions there at this point, just don't do the check
1952 CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1954 BEGIN_WITH_WORKER_LOCK(w) {
1955 CILK_ASSERT(!w->l->frame_ff);
1956 BEGIN_WITH_FRAME_LOCK(w, ff) {
1957 sf = ff->call_stack;
1958 CILK_ASSERT(sf && !sf->call_parent);
1959 setup_for_execution(w, ff, 0);
1960 } END_WITH_FRAME_LOCK(w, ff);
1961 } END_WITH_WORKER_LOCK(w);
1963 /* run it */
1965 // Prepare to run the full frame. To do so, we need to:
1966 // (a) Execute some code on this fiber (the scheduling
1967 // fiber) to set up data structures, and
1968 // (b) Suspend the scheduling fiber, and resume the
1969 // user-code fiber.
1971 // Part (a). Set up data structures.
1972 scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);
1974 cilk_fiber *other = w->l->frame_ff->fiber_self;
1975 cilk_fiber_data* other_data = cilk_fiber_get_data(other);
1976 cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);
1978 // I believe two cases are possible here, both of which
1979 // should have other_data->resume_sf as NULL.
1981 // 1. Resuming a fiber that was previously executing
1982 // user code (i.e., a provably-good-steal).
1983 // In this case, resume_sf should have been
1984 // set to NULL when it was suspended.
1986 // 2. Resuming code on a steal. In this case, since we
1987 // grabbed a new fiber, resume_sf should be NULL.
1988 CILK_ASSERT(NULL == other_data->resume_sf);
1990 #if FIBER_DEBUG >= 2
1991 fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
1992 w->self, other, other_data->resume_sf);
1993 #endif
1994 // Update our own fiber's data.
1995 current_fiber_data->resume_sf = NULL;
1996 // The scheduling fiber should have the right owner from before.
1997 CILK_ASSERT(current_fiber_data->owner == w);
1998 other_data->resume_sf = sf;
2001 #if FIBER_DEBUG >= 3
2002 fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
2003 cilkos_get_current_thread_id(),
2004 w->self,
2005 current_fiber, other,
2006 current_fiber_data->resume_sf,
2007 other_data->resume_sf);
2008 #endif
2009 return other;
2014 * This function is executed once by each worker, to initialize its
2015 * scheduling loop.
2017 static void worker_scheduler_init_function(__cilkrts_worker *w)
2019 // First, execute the startup tasks that must happen for all
2020 // worker types.
2021 ITT_SYNC_PREPARE(w);
2022 /* Notify tools about the new worker. Inspector needs this, but we
2023 don't want to confuse Cilkscreen with system threads. User threads
2024 do this notification in bind_thread */
2025 if (! w->g->under_ptool)
2026 __cilkrts_cilkscreen_establish_worker(w);
2028 // Seed the initial random number generator.
2029 // If we forget to do this, then the worker always steals from 0.
2030 // Programs will still execute correctly, but
2031 // you may see a subtle performance bug...
2032 mysrand(w, (w->self + 1));
2034 // The startup work varies, depending on the worker type.
2035 switch (w->l->type) {
2036 case WORKER_USER:
2037 break;
2039 case WORKER_SYSTEM:
2040 // If a system worker is starting, we must also be starting
2041 // the runtime.
2043 // Runtime begins in a wait-state and is woken up by the first user
2044 // worker when the runtime is ready.
2045 signal_node_wait(w->l->signal_node);
2046 // ...
2047 // Runtime is waking up.
2048 notify_children_run(w);
2049 w->l->steal_failure_count = 0;
2050 break;
2051 default:
2052 __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
2053 w, w->l->type);
2058 * This function is executed once by each worker, to finish its
2059 * scheduling loop.
2061 * @note Currently, only system workers finish their loops. User
2062 * workers will jump away to user code without exiting their
2063 * scheduling loop.
2065 static void worker_scheduler_terminate_function(__cilkrts_worker *w)
2067 // A user worker should never finish by falling through the
2068 // scheduling loop.
2069 CILK_ASSERT(WORKER_USER != w->l->type);
2073 * The main scheduler function executed by a worker's scheduling
2074 * fiber.
2076 * This method is started by either a new system worker, or a user
2077 * worker that has stalled and just been imported into the runtime.
2079 static void worker_scheduler_function(__cilkrts_worker *w)
2081 START_INTERVAL(w, INTERVAL_INIT_WORKER);
2082 worker_scheduler_init_function(w);
2083 STOP_INTERVAL(w, INTERVAL_INIT_WORKER);
2085 // The main scheduling loop body.
2087 while (!w->g->work_done) {
2088 // Execute the "body" of the scheduling loop, and figure
2089 // out the fiber to jump to next.
2090 START_INTERVAL(w, INTERVAL_SCHED_LOOP);
2091 cilk_fiber* fiber_to_resume
2092 = worker_scheduling_loop_body(w->l->scheduling_fiber, w);
2093 STOP_INTERVAL(w, INTERVAL_SCHED_LOOP);
2095 if (fiber_to_resume) {
2096 // Suspend the current fiber and resume next one.
2097 NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
2099 // Whenever we jump to resume user code, we stop being in
2100 // the runtime, and start working.
2101 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2102 START_INTERVAL(w, INTERVAL_WORKING);
2103 cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
2104 fiber_to_resume);
2105 // Return here only when this (scheduling) fiber is
2106 // resumed (i.e., this worker wants to reenter the runtime).
2108 // We've already switched from WORKING to IN_RUNTIME in
2109 // the runtime code that handles the fiber switch. Thus, at
2110 // this point we are IN_RUNTIME already.
2114 // Finish the scheduling loop.
2115 worker_scheduler_terminate_function(w);
2119 /*************************************************************
2120 Forward declarations for reduction protocol.
2121 *************************************************************/
2123 static __cilkrts_worker*
2124 execute_reductions_for_sync(__cilkrts_worker *w,
2125 full_frame *ff,
2126 __cilkrts_stack_frame *sf_at_sync);
2128 static __cilkrts_worker*
2129 execute_reductions_for_spawn_return(__cilkrts_worker *w,
2130 full_frame *ff,
2131 __cilkrts_stack_frame *returning_sf);
2135 /*************************************************************
2136 Scheduler functions that are callable by client code
2137 *************************************************************/
2138 static full_frame *disown(__cilkrts_worker *w,
2139 full_frame *ff,
2140 __cilkrts_stack_frame *sf,
2141 const char *why)
2143 CILK_ASSERT(ff);
2144 make_unrunnable(w, ff, sf, sf != 0, why);
2145 w->l->frame_ff = 0;
2146 return ff->parent;
2150 * Called when ff is returning from a spawn, and we need to execute a
2151 * reduction.
2153 * @param w The currently executing worker.
2154 * @param ff The full frame for w.
2155 * @param returning_sf The stack frame for the spawn helper that is returning.
2157 * Normally, by the time we gain control in the runtime, the worker
2158 * has already popped off the __cilkrts_stack_frame "returning_sf"
2159 * from its call chain.
2161 * When we have only serial reductions, w->current_stack_frame is not
2162 * needed any more, because w is about to enter the runtime scheduling
2163 * loop anyway. Similarly, the frame "ff" is slated to be destroyed
2164 * after the runtime finishes the return from spawn and splices ff out
2165 * of the tree of full frames.
2167 * To execute a parallel reduction, however, we still want
2168 * w->current_stack_frame == returning_sf, and we are going to use the
2169 * frame ff for a little bit longer.
2171 * This method:
2173 * 1. Puts returning_sf back as w's current stack frame.
2174 * 2. Makes "ff" runnable again on w.
2176 static inline
2177 void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
2178 full_frame *ff,
2179 __cilkrts_stack_frame *returning_sf) {
2180 #if REDPAR_DEBUG >= 2
2181 CILK_ASSERT(returning_sf);
2182 CILK_ASSERT(returning_sf->worker == w);
2183 #endif
2184 // Change w's current stack frame back to "returning_sf".
2186 // Intuitively, w->current_stack_frame should be
2187 // returning_sf->call_parent at this point.
2189 // We cannot assert this, however, because the pop of
2190 // returning_sf from the call chain has already cleared
2191 // returning_sf->call_parent. We don't want to restore the call
2192 // parent of returning_sf, because its parent has been stolen, and
2193 // the runtime assumes that steals break this link.
2195 // We cannot assert call_parent is NULL either, since that's not true for
2196 // Win64 exception handling
2197 // CILK_ASSERT(returning_sf->call_parent == NULL);
2198 w->current_stack_frame = returning_sf;
2200 // Make the full frame "ff" runnable again, in preparation for
2201 // executing the reduction.
2202 make_runnable(w, ff);
2206 NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
2207 __cilkrts_stack_frame *sf_at_sync)
2209 full_frame *ff;
2210 STOP_INTERVAL(w, INTERVAL_WORKING);
2211 START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2213 // Claim: This read of w->l->frame_ff can occur without
2214 // holding the worker lock because when w has reached a sync
2215 // and entered the runtime (because it stalls), w's deque is empty
2216 // and no one else can steal and change w->l->frame_ff.
2218 ff = w->l->frame_ff;
2219 #ifdef _WIN32
2220 __cilkrts_save_exception_state(w, ff);
2221 #else
2222 // Move any pending exceptions into the full frame
2223 CILK_ASSERT(NULL == ff->pending_exception);
2224 ff->pending_exception = w->l->pending_exception;
2225 w->l->pending_exception = NULL;
2226 #endif
2228 w = execute_reductions_for_sync(w, ff, sf_at_sync);
2230 #if FIBER_DEBUG >= 3
2231 fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtime [c_sync] with ff=%p\n",
2232 cilkos_get_current_thread_id(), w->self, ff);
2233 #endif
2235 longjmp_into_runtime(w, do_sync, sf_at_sync);
2238 static void do_sync(__cilkrts_worker *w, full_frame *ff,
2239 __cilkrts_stack_frame *sf)
2241 //int abandoned = 1;
2242 enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2244 START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
2245 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2247 CILK_ASSERT(ff);
2248 BEGIN_WITH_FRAME_LOCK(w, ff) {
2249 CILK_ASSERT(sf->call_parent == 0);
2250 CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);
2252 // Before switching into the scheduling fiber, we should have
2253 // already taken care of deallocating the current
2254 // fiber.
2255 CILK_ASSERT(NULL == ff->fiber_self);
2257 // Update the frame's pedigree information if this is an ABI 1
2258 // or later frame
2259 if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
2261 sf->parent_pedigree.rank = w->pedigree.rank;
2262 sf->parent_pedigree.parent = w->pedigree.parent;
2264 // Note that the pedigree rank needs to be updated
2265 // when setup_for_execution_pedigree runs
2266 sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
2269 /* the decjoin() occurs in provably_good_steal() */
2270 steal_result = provably_good_steal(w, ff);
2272 } END_WITH_FRAME_LOCK(w, ff);
2273 // set w->l->frame_ff = NULL after checking abandoned
2274 if (WAIT_FOR_CONTINUE != steal_result) {
2275 w->l->frame_ff = NULL;
2277 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2278 } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);
2280 // Now, if we are in a replay situation and provably_good_steal() returned
2281 // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
2282 // provably_good_steal(), and release locks until we get a value other
2283 // than WAIT_FOR_CONTINUE from the function.
2284 #ifdef CILK_RECORD_REPLAY
2285 // We don't have to explicitly check for REPLAY_LOG below because
2286 // steal_result can only be set to WAIT_FOR_CONTINUE during replay
2287 while(WAIT_FOR_CONTINUE == steal_result)
2289 __cilkrts_sleep();
2290 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2292 ff = w->l->frame_ff;
2293 BEGIN_WITH_FRAME_LOCK(w, ff)
2295 steal_result = provably_good_steal(w, ff);
2296 } END_WITH_FRAME_LOCK(w, ff);
2297 if (WAIT_FOR_CONTINUE != steal_result)
2298 w->l->frame_ff = NULL;
2299 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2301 #endif // CILK_RECORD_REPLAY
2303 #ifdef ENABLE_NOTIFY_ZC_INTRINSIC
2304 // If we can't make any further progress on this thread, tell Inspector
2305 // that we're abandoning the work and will go find something else to do.
2306 if (ABANDON_EXECUTION == steal_result)
2308 NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
2310 #endif // defined ENABLE_NOTIFY_ZC_INTRINSIC
2312 return; /* back to scheduler loop */
2315 /* worker W completely promotes its own deque, simulating the case
2316 where the whole deque is stolen. We use this mechanism to force
2317 the allocation of new storage for reducers for race-detection
2318 purposes. */
2319 void __cilkrts_promote_own_deque(__cilkrts_worker *w)
2321 // Remember the fiber we start this method on.
2322 CILK_ASSERT(w->l->frame_ff);
2323 cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;
2325 BEGIN_WITH_WORKER_LOCK(w) {
2326 while (dekker_protocol(w)) {
2327 /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
2328 and make_child() that this frame should be treated as a spawn
2329 parent, even though we have not assigned it a stack. */
2330 detach_for_steal(w, w, PLACEHOLDER_FIBER);
2332 } END_WITH_WORKER_LOCK(w);
2335 // TBD: The management of full frames and fibers is a bit
2336 // sketchy here. We are promoting stack frames into full frames,
2337 // and pretending they are stolen away, but no other worker is
2338 // actually working on them. Some runtime invariants
2339 // may be broken here.
2341 // Technically, if we are simulating a steal from w,
2342 // w should get a new full frame but
2343 // keep the same fiber. A real thief would take the
2344 // loot frame away, get a new fiber, and start executing the
2345 // loot frame.
2347 // What should a fake thief do? Where does the frame go?
2349 // In any case, we should finish the promotion process on
2350 // the same fiber we started with.
2351 CILK_ASSERT(w->l->frame_ff);
2352 CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
2357 /* the client code calls this function after a spawn when the dekker
2358 protocol fails. The function may either return or longjmp
2359 into the rts
2361 This function takes in a "returning_sf" argument which corresponds
2362 to the __cilkrts_stack_frame that we are finishing (i.e., the
2363 argument to __cilkrts_leave_frame).
2365 void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
2366 __cilkrts_stack_frame *returning_sf)
2368 full_frame *ff;
2369 int stolen_p;
2370 __cilkrts_stack_frame *saved_sf = NULL;
2372 // For the exception check, stop working and count as time in
2373 // runtime.
2374 STOP_INTERVAL(w, INTERVAL_WORKING);
2375 START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2377 START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2379 BEGIN_WITH_WORKER_LOCK(w) {
2380 ff = w->l->frame_ff;
2381 CILK_ASSERT(ff);
2382 /* This code is called only upon a normal return and never
2383 upon an exceptional return. Assert that this is the
2384 case. */
2385 CILK_ASSERT(!w->l->pending_exception);
2387 reset_THE_exception(w);
2388 stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
2389 speculatively
2390 decremented by the
2391 compiled code */
2393 if (stolen_p) {
2394 /* XXX This will be charged to THE for accounting purposes */
2395 __cilkrts_save_exception_state(w, ff);
2397 // Save the value of the current stack frame.
2398 saved_sf = w->current_stack_frame;
2400 // Reverse the decrement from undo_detach.
2401 // This update effectively resets the deque to be
2402 // empty (i.e., changes w->tail back to equal w->head).
2403 // We need to reset the deque to execute parallel
2404 // reductions. When we have only serial reductions, it
2405 // does not matter, since serial reductions do not
2406 // change the deque.
2407 w->tail++;
2408 #if REDPAR_DEBUG > 1
2409 // ASSERT our deque is empty.
2410 CILK_ASSERT(w->head == w->tail);
2411 #endif
2413 } END_WITH_WORKER_LOCK(w);
2415 STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2417 if (stolen_p)
2419 w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2421 // "Mr. Policeman? My parent always told me that if I was in trouble
2422 // I should ask a nice policeman for help. I can't find my parent
2423 // anywhere..."
2425 // Write a record to the replay log for an attempt to return to a stolen parent
2426 replay_record_orphaned(w);
2428 // Update the pedigree only after we've finished the
2429 // reductions.
2430 update_pedigree_on_leave_frame(w, returning_sf);
2432 // Notify Inspector that the parent has been stolen and we're
2433 // going to abandon this work and go do something else. This
2434 // will match the cilk_leave_begin in the compiled code
2435 NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);
2437 DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
2438 longjmp_into_runtime(w, do_return_from_spawn, 0);
2439 DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
2441 else
2443 NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
2445 // If we fail the exception check and return, then switch back
2446 // to working.
2447 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2448 START_INTERVAL(w, INTERVAL_WORKING);
2449 return;
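/* The stolen_p test in __cilkrts_c_THE_exception_check above can be read
 * as a small predicate over the deque pointers (a sketch only; the real
 * check is inlined in the function above):
 *
 *     static int parent_was_stolen(__cilkrts_worker *w)
 *     {
 *         // The compiled code has already speculatively decremented
 *         // w->tail, so the parent is still in our deque exactly when
 *         // head < tail + 1; otherwise a thief has taken it.
 *         return !(w->head < (w->tail + 1));
 *     }
 */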
2453 /* Return an exception to a stolen parent. */
2454 NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
2455 __cilkrts_stack_frame *returning_sf)
2457 full_frame *ff = w->l->frame_ff;
2458 STOP_INTERVAL(w, INTERVAL_WORKING);
2459 START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2461 // This is almost the same as THE_exception_check, except
2462 // that the detach didn't happen, so we don't need to undo the
2463 // tail update.
2464 CILK_ASSERT(w->head == w->tail);
2465 w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2467 longjmp_into_runtime(w, do_return_from_spawn, 0);
2468 CILK_ASSERT(0);
2471 static void do_return_from_spawn(__cilkrts_worker *w,
2472 full_frame *ff,
2473 __cilkrts_stack_frame *sf)
2475 full_frame *parent_ff;
2476 enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2478 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2479 CILK_ASSERT(ff);
2480 CILK_ASSERT(!ff->is_call_child);
2481 CILK_ASSERT(sf == NULL);
2482 parent_ff = ff->parent;
2484 BEGIN_WITH_FRAME_LOCK(w, ff) {
2485 decjoin(ff);
2486 } END_WITH_FRAME_LOCK(w, ff);
2488 BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2489 if (parent_ff->simulated_stolen)
2490 unconditional_steal(w, parent_ff);
2491 else
2492 steal_result = provably_good_steal(w, parent_ff);
2493 } END_WITH_FRAME_LOCK(w, parent_ff);
2495 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2497 // Loop here in replay mode
2498 #ifdef CILK_RECORD_REPLAY
2499 // We don't have to explicitly check for REPLAY_LOG below because
2500 // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
2501 // We also don't have to worry about the simulated_stolen flag
2502 // because steal_result can only be set to WAIT_FOR_CONTINUE by
2503 // provably_good_steal().
2504 while(WAIT_FOR_CONTINUE == steal_result)
2506 __cilkrts_sleep();
2507 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2509 BEGIN_WITH_FRAME_LOCK(w, parent_ff)
2511 steal_result = provably_good_steal(w, parent_ff);
2512 } END_WITH_FRAME_LOCK(w, parent_ff);
2513 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2515 #endif // CILK_RECORD_REPLAY
2517 // Cleanup the child frame.
2518 __cilkrts_destroy_full_frame(w, ff);
2519 return;
2522 #ifdef _WIN32
2523 /* migrate an exception across fibers. Call this function when an exception has
2524 * been thrown and has to traverse across a steal. The exception has already
2525 * been wrapped up, so all that remains is to longjmp() into the continuation,
2526 * sync, and re-raise it.
2528 void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {
2530 __cilkrts_worker *w = sf->worker;
2531 full_frame *ff;
2533 BEGIN_WITH_WORKER_LOCK(w) {
2534 ff = w->l->frame_ff;
2535 reset_THE_exception(w);
2536 /* there is no need to check for a steal because we wouldn't be here if
2537 there weren't a steal. */
2538 __cilkrts_save_exception_state(w, ff);
2540 CILK_ASSERT(w->head == w->tail);
2541 } END_WITH_WORKER_LOCK(w);
2544 // TBD(jsukha): This function emulates
2545 // the "do_return_from_spawn" path.
2546 w = execute_reductions_for_spawn_return(w, ff, sf);
2549 longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
2550 CILK_ASSERT(! "Shouldn't be here...");
2552 #endif
2555 /* Pop a call stack from TAIL. Return the call stack, or NULL if the
2556 queue is empty */
2557 __cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
2559 __cilkrts_stack_frame *sf;
2560 BEGIN_WITH_WORKER_LOCK(w) {
2561 __cilkrts_stack_frame *volatile *tail = w->tail;
2562 if (w->head < tail) {
2563 --tail;
2564 sf = *tail;
2565 w->tail = tail;
2566 } else {
2567 sf = 0;
2569 } END_WITH_WORKER_LOCK(w);
2570 return sf;
2573 #ifdef CILK_RECORD_REPLAY
2574 __cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
2576 __cilkrts_stack_frame *sf;
2577 BEGIN_WITH_WORKER_LOCK(w) {
2578 if (w->head < w->tail) {
2579 sf = *(w->tail-1);
2580 } else {
2581 sf = 0;
2583 } END_WITH_WORKER_LOCK(w);
2584 return sf;
2586 #endif
2589 /* Return from a call, not a spawn. */
2590 void __cilkrts_return(__cilkrts_worker *w)
2592 full_frame *ff, *parent_ff;
2594 // Count time during the return as in the runtime.
2595 STOP_INTERVAL(w, INTERVAL_WORKING);
2596 START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2597 START_INTERVAL(w, INTERVAL_RETURNING);
2599 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2600 ff = w->l->frame_ff;
2601 CILK_ASSERT(ff);
2602 CILK_ASSERT(ff->join_counter == 1);
2603 /* This path is not used to return from spawn. */
2604 CILK_ASSERT(ff->is_call_child);
2606 BEGIN_WITH_FRAME_LOCK(w, ff) {
2607 // After this call, w->l->frame_ff != ff.
2608 // Technically, however, w will still "own" ff until ff is freed,
2609 // because ff is a dying leaf full frame.
2610 parent_ff = disown(w, ff, 0, "return");
2611 decjoin(ff);
2613 #ifdef _WIN32
2614 __cilkrts_save_exception_state(w, ff);
2615 #else
2616 // Move the pending exceptions into the full frame
2617 // This should always be NULL if this isn't a
2618 // return with an exception
2619 CILK_ASSERT(NULL == ff->pending_exception);
2620 ff->pending_exception = w->l->pending_exception;
2621 w->l->pending_exception = NULL;
2622 #endif // _WIN32
2624 } END_WITH_FRAME_LOCK(w, ff);
2626 __cilkrts_fence(); /* redundant */
2628 CILK_ASSERT(parent_ff);
2630 BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2631 finalize_child_for_call(w, parent_ff, ff);
2632 } END_WITH_FRAME_LOCK(w, parent_ff);
2634 ff = pop_next_frame(w);
2635 /* ff will be non-null except when the parent frame is owned
2636 by another worker.
2637 CILK_ASSERT(ff)
2639 CILK_ASSERT(!w->l->frame_ff);
2640 if (ff) {
2641 BEGIN_WITH_FRAME_LOCK(w, ff) {
2642 __cilkrts_stack_frame *sf = ff->call_stack;
2643 CILK_ASSERT(sf && !sf->call_parent);
2644 setup_for_execution(w, ff, 1);
2645 } END_WITH_FRAME_LOCK(w, ff);
2647 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2649 STOP_INTERVAL(w, INTERVAL_RETURNING);
2650 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2651 START_INTERVAL(w, INTERVAL_WORKING);
2654 static void __cilkrts_unbind_thread()
2656 int stop_cilkscreen = 0;
2657 global_state_t *g;
2659 // Take out the global OS mutex to protect accesses to the table of workers
2660 global_os_mutex_lock();
2662 if (cilkg_is_published()) {
2663 __cilkrts_worker *w = __cilkrts_get_tls_worker();
2664 if (w) {
2665 g = w->g;
2668 // Matches the START in bind_thread in cilk-abi.c.
2669 STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2670 STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2672 __cilkrts_set_tls_worker(0);
2674 if (w->self == -1) {
2675 // This worker is an overflow worker. I.e., it was created on-
2676 // demand when the global pool ran out of workers.
2677 destroy_worker(w);
2678 __cilkrts_free(w);
2679 } else {
2680 // This is a normal user worker and needs to be counted by the
2681 // global state for the purposes of throttling system workers.
2682 w->l->type = WORKER_FREE;
2683 __cilkrts_leave_cilk(g);
2686 stop_cilkscreen = (0 == g->Q);
2689 global_os_mutex_unlock();
2691 /* Turn off Cilkscreen. This needs to be done when we are NOT holding the
2692 * os mutex. */
2693 if (stop_cilkscreen)
2694 __cilkrts_cilkscreen_disable_instrumentation();
2697 /* special return from the initial frame */
2699 void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
2701 struct cilkred_map *rm;
2703 // When we are returning from the initial frame, switch from
2704 // INTERVAL_WORKING into INTERVAL_IN_RUNTIME.
2705 STOP_INTERVAL(w, INTERVAL_WORKING);
2706 START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2708 /* This is only called on a user thread worker. */
2709 CILK_ASSERT(w->l->type == WORKER_USER);
2711 #if REDPAR_DEBUG >= 3
2712 fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
2713 w->self, w->l->frame_ff);
2714 #endif
2716 BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2717 full_frame *ff = w->l->frame_ff;
2718 CILK_ASSERT(ff);
2719 CILK_ASSERT(ff->join_counter == 1);
2720 w->l->frame_ff = 0;
2722 CILK_ASSERT(ff->fiber_self);
2723 // Save any TBB interop data for the next time this thread enters Cilk
2724 cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);
2726 // Deallocate cilk_fiber that mapped to the user stack. The stack
2727 // itself does not get deallocated (of course) but our data
2728 // structure becomes divorced from it.
2730 #if FIBER_DEBUG >= 1
2731 fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
2732 cilkos_get_current_thread_id(),
2733 w->self,
2734 ff->fiber_self,
2735 w->l->scheduling_fiber,
2736 w->l->type);
2737 #endif
2738 // The fiber in ff is a user-code fiber. The fiber in
2739 // w->l->scheduling_fiber is a scheduling fiber. These fibers should
2740 // never be equal. When a user worker returns (and will unbind), we
2741 // should destroy only the fiber in ff. The scheduling fiber will be
2742 // re-used.
2744 CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);
2746 START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
2747 // This fiber might not be deallocated here if there
2748 // is a pending exception on Windows that refers
2749 // to this fiber.
2751 // First "suspend" the fiber, and then try to delete it.
2752 cilk_fiber_deallocate_from_thread(ff->fiber_self);
2753 } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
2754 ff->fiber_self = NULL;
2756 /* Save reducer map into global_state object */
2757 rm = w->reducer_map;
2758 w->reducer_map = NULL;
2760 #if REDPAR_DEBUG >= 3
2761 fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
2762 w->self,
2764 ff);
2765 #endif
2766 __cilkrts_destroy_full_frame(w, ff);
2769 /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
2770 } END_WITH_WORKER_LOCK_OPTIONAL(w);
2773 save_pedigree_leaf_from_user_worker(w);
2775 // Workers can have NULL reducer maps now.
2776 if (rm) {
2777 __cilkrts_destroy_reducer_map(w, rm);
2781 #if FIBER_DEBUG >= 1
2782 __cilkrts_worker* tmp = w;
2783 int tmp_id = w->self;
2784 fprintf(stderr, "w=%d: We are about to unbind the thread (w= %p)\n",
2785 w->self,
2787 #endif
2789 w = NULL;
2791 __cilkrts_unbind_thread();
2793 #if FIBER_DEBUG >= 1
2795 fprintf(stderr, "w=%p, %d: Finished unbind\n",
2796 tmp, tmp_id);
2797 #endif
2799 /* Other workers will stop trying to steal if this was the last worker. */
2801 return;
2806 * __cilkrts_restore_stealing
2808 * Restore the protected_tail to a previous state, possibly allowing frames
2809 * to be stolen. The dekker_protocol has been extended to steal only if
2810 * head+1 is < protected_tail.
2813 void __cilkrts_restore_stealing(
2814 __cilkrts_worker *w,
2815 __cilkrts_stack_frame *volatile *saved_protected_tail)
2817 /* On most x86 this pair of operations would be slightly faster
2818 as an atomic exchange due to the implicit memory barrier in
2819 an atomic instruction. */
2820 w->protected_tail = saved_protected_tail;
2821 __cilkrts_fence();
2825 * __cilkrts_disallow_stealing
2827 * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
2828 * frames from being stolen. If NEW_PROTECTED_TAIL is NULL, prevent
2829 * stealing from the whole queue. The dekker_protocol has been
2830 * extended to only steal if head+1 is also < protected_tail.
2833 __cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
2834 __cilkrts_worker *w,
2835 __cilkrts_stack_frame *volatile *new_protected_tail)
2837 __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;
2839 if (!new_protected_tail)
2840 new_protected_tail = w->l->ltq;
2842 if (w->protected_tail > new_protected_tail) {
2843 w->protected_tail = new_protected_tail;
2844 /* Issue a store-store barrier. The update to protected_tail
2845 here must precede the update to tail in the next spawn.
2846 On x86 this is probably not needed. */
2847 #if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__)
2848 _mm_sfence();
2849 #else
2850 __cilkrts_fence();
2851 #endif
2854 return saved_protected_tail;
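/* A typical pairing of the two entry points above, sketched with a
 * hypothetical critical region (this snippet is illustrative and does not
 * appear elsewhere in the runtime):
 *
 *     __cilkrts_stack_frame *volatile *saved =
 *         __cilkrts_disallow_stealing(w, NULL);    // protect whole deque
 *     // ... manipulate w's deque without interference from thieves ...
 *     __cilkrts_restore_stealing(w, saved);        // re-enable stealing
 *
 * Passing NULL moves protected_tail down to w->l->ltq, which blocks
 * stealing from the entire queue, as described in the comment above.
 */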
2857 /*************************************************************
2858 Initialization and startup
2859 *************************************************************/
2861 __cilkrts_worker *make_worker(global_state_t *g,
2862 int self, __cilkrts_worker *w)
2864 w->self = self;
2865 w->g = g;
2867 w->pedigree.rank = 0; // Initial rank is 0
2868 w->pedigree.parent = NULL;
2870 w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));
2872 __cilkrts_frame_malloc_per_worker_init(w);
2874 w->reducer_map = NULL;
2875 w->current_stack_frame = NULL;
2876 w->reserved = NULL;
2878 w->l->worker_magic_0 = WORKER_MAGIC_0;
2879 w->l->team = NULL;
2880 w->l->type = WORKER_FREE;
2882 __cilkrts_mutex_init(&w->l->lock);
2883 __cilkrts_mutex_init(&w->l->steal_lock);
2884 w->l->do_not_steal = 0;
2885 w->l->frame_ff = 0;
2886 w->l->next_frame_ff = 0;
2887 w->l->last_full_frame = NULL;
2889 w->l->ltq = (__cilkrts_stack_frame **)
2890 __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
2891 w->ltq_limit = w->l->ltq + g->ltqsize;
2892 w->head = w->tail = w->l->ltq;
2894 cilk_fiber_pool_init(&w->l->fiber_pool,
2895 &g->fiber_pool,
2896 g->stack_size,
2897 g->fiber_pool_size,
2898 0, // alloc_max is 0. We don't allocate from the heap directly without checking the parent pool.
2900 #if FIBER_DEBUG >= 2
2901 fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
2902 cilkos_get_current_thread_id(),
2903 w->self, w,
2904 &w->l->fiber_pool);
2905 #endif
2906 w->l->scheduling_fiber = NULL;
2907 w->l->original_pedigree_leaf = NULL;
2908 w->l->rand_seed = 0; /* the scheduler will overwrite this field */
2910 w->l->post_suspend = 0;
2911 w->l->suspended_stack = 0;
2912 w->l->fiber_to_free = NULL;
2913 w->l->pending_exception = NULL;
2915 #if CILK_PROFILE
2916 w->l->stats = __cilkrts_malloc(sizeof(statistics));
2917 __cilkrts_init_stats(w->l->stats);
2918 #else
2919 w->l->stats = NULL;
2920 #endif
2921 w->l->steal_failure_count = 0;
2922 w->l->has_stolen = 0;
2924 w->l->work_stolen = 0;
2926 // Initialize record/replay assuming we're doing neither
2927 w->l->record_replay_fptr = NULL;
2928 w->l->replay_list_root = NULL;
2929 w->l->replay_list_entry = NULL;
2930 w->l->signal_node = NULL;
2931 // Nothing's been stolen yet
2932 w->l->worker_magic_1 = WORKER_MAGIC_1;
2934 /*w->parallelism_disabled = 0;*/
2936 // Allow stealing all frames. Sets w->saved_protected_tail
2937 __cilkrts_restore_stealing(w, w->ltq_limit);
2939 __cilkrts_init_worker_sysdep(w);
2941 reset_THE_exception(w);
2943 return w;
2946 void destroy_worker(__cilkrts_worker *w)
2948 CILK_ASSERT (NULL == w->l->pending_exception);
2950 // Deallocate the scheduling fiber
2951 if (NULL != w->l->scheduling_fiber)
2953 // The scheduling fiber is the main fiber for system workers and must
2954 // be deallocated by the thread that created it. Thus, we can
2955 // deallocate only free workers' (formerly user workers) scheduling
2956 // fibers here.
2957 CILK_ASSERT(WORKER_FREE == w->l->type);
2959 #if FIBER_DEBUG >=1
2960 fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n",
2961 cilkos_get_current_thread_id(),
2963 w->self,
2964 w->l->scheduling_fiber);
2965 #endif
2966 int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
2967 // Scheduling fiber should never have extra references because of exceptions.
2968 CILK_ASSERT(0 == ref_count);
2969 w->l->scheduling_fiber = NULL;
2972 #if CILK_PROFILE
2973 if (w->l->stats) {
2974 __cilkrts_free(w->l->stats);
2976 #else
2977 CILK_ASSERT(NULL == w->l->stats);
2978 #endif
2980 /* Free any cached fibers. */
2981 cilk_fiber_pool_destroy(&w->l->fiber_pool);
2983 __cilkrts_destroy_worker_sysdep(w);
2985 if (w->l->signal_node) {
2986 CILK_ASSERT(WORKER_SYSTEM == w->l->type);
2987 signal_node_destroy(w->l->signal_node);
2990 __cilkrts_free(w->l->ltq);
2991 __cilkrts_mutex_destroy(0, &w->l->lock);
2992 __cilkrts_mutex_destroy(0, &w->l->steal_lock);
2993 __cilkrts_frame_malloc_per_worker_cleanup(w);
2995 __cilkrts_free(w->l);
2997 // The caller is responsible for freeing the worker memory
3001 * Make a worker into a system worker.
3003 static void make_worker_system(__cilkrts_worker *w) {
3004 CILK_ASSERT(WORKER_FREE == w->l->type);
3005 w->l->type = WORKER_SYSTEM;
3006 w->l->signal_node = signal_node_create();
3009 void __cilkrts_deinit_internal(global_state_t *g)
3011 int i;
3012 __cilkrts_worker *w;
3014 // If there's no global state then we're done
3015 if (NULL == g)
3016 return;
3018 #ifdef CILK_PROFILE
3019 __cilkrts_dump_stats_to_stderr(g);
3020 #endif
3022 w = g->workers[0];
3023 if (w->l->frame_ff) {
3024 __cilkrts_destroy_full_frame(w, w->l->frame_ff);
3025 w->l->frame_ff = 0;
3028 // Release any resources used for record/replay
3029 replay_term(g);
3031 // Destroy any system dependent global state
3032 __cilkrts_destroy_global_sysdep(g);
3034 for (i = 0; i < g->total_workers; ++i)
3035 destroy_worker(g->workers[i]);
3037 // Free memory for all worker blocks which were allocated contiguously
3038 __cilkrts_free(g->workers[0]);
3040 __cilkrts_free(g->workers);
3042 cilk_fiber_pool_destroy(&g->fiber_pool);
3043 __cilkrts_frame_malloc_global_cleanup(g);
3045 cilkg_deinit_global_state();
3049 * Wake the runtime by notifying the system workers that they can steal. The
3050 * first user worker into the runtime should call this.
3052 static void wake_runtime(global_state_t *g)
3054 __cilkrts_worker *root;
3055 if (g->P > 1) {
3056 // Send a message to the root node. The message will propagate.
3057 root = g->workers[0];
3058 CILK_ASSERT(root->l->signal_node);
3059 signal_node_msg(root->l->signal_node, 1);
3064 * Put the runtime to sleep. The last user worker out of the runtime should
3065 * call this. Like Dad always said, turn out the lights when nobody's in the
3066 * room.
3068 static void sleep_runtime(global_state_t *g)
3070 __cilkrts_worker *root;
3071 if (g->P > 1) {
3072 // Send a message to the root node. The message will propagate.
3073 root = g->workers[0];
3074 CILK_ASSERT(root->l->signal_node);
3075 signal_node_msg(root->l->signal_node, 0);
3079 /* Called when a user thread joins Cilk.
3080 Global lock must be held. */
3081 void __cilkrts_enter_cilk(global_state_t *g)
3083 if (g->Q++ == 0) {
3084 // If this is the first user thread to enter Cilk wake
3085 // up all the workers.
3086 wake_runtime(g);
3090 /* Called when a user thread leaves Cilk.
3091 Global lock must be held. */
3092 void __cilkrts_leave_cilk(global_state_t *g)
3094 if (--g->Q == 0) {
3095 // Put the runtime to sleep.
3096 sleep_runtime(g);
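/* Callers are expected to pair these two entry points under the global OS
 * mutex, roughly as sketched below (the enter side presumably lives in
 * bind_thread in cilk-abi.c; the leave side is visible above in
 * __cilkrts_unbind_thread):
 *
 *     global_os_mutex_lock();
 *     __cilkrts_enter_cilk(g);     // first thread in (Q: 0 -> 1)
 *     global_os_mutex_unlock();    // wakes the system workers
 *
 *     // ... this user thread runs Cilk code ...
 *
 *     global_os_mutex_lock();
 *     __cilkrts_leave_cilk(g);     // last thread out (Q: 1 -> 0)
 *     global_os_mutex_unlock();    // puts the runtime back to sleep
 */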
3101 * worker_runnable
3103 * Return whether the worker should run (keep trying to steal), wait, or exit.
3106 NOINLINE
3107 static enum schedule_t worker_runnable(__cilkrts_worker *w)
3109 global_state_t *g = w->g;
3111 /* If this worker has something to do, do it.
3112 Otherwise the work would be lost. */
3113 if (w->l->next_frame_ff)
3114 return SCHEDULE_RUN;
3116 // If Cilk has explicitly (by the user) been told to exit (i.e., by
3117 // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then exit the scheduler.
3118 if (g->work_done)
3119 return SCHEDULE_EXIT;
3121 if (0 == w->self) {
3122 // This worker is the root node and is the only one that may query the
3123 // global state to see if there are still any user workers in Cilk.
3124 if (w->l->steal_failure_count > g->max_steal_failures) {
3125 if (signal_node_should_wait(w->l->signal_node)) {
3126 return SCHEDULE_WAIT;
3127 } else {
3128 // Reset the steal_failure_count since we have verified that
3129 // user workers are still in Cilk.
3130 w->l->steal_failure_count = 0;
3133 } else if (WORKER_SYSTEM == w->l->type &&
3134 signal_node_should_wait(w->l->signal_node)) {
3135 // This worker has been notified by its parent that it should stop
3136 // trying to steal.
3137 return SCHEDULE_WAIT;
3140 return SCHEDULE_RUN;
3145 // Initialize the worker structs, but don't start the workers themselves.
3146 static void init_workers(global_state_t *g)
3148 int total_workers = g->total_workers;
3149 int i;
3150 struct CILK_ALIGNAS(256) buffered_worker {
3151 __cilkrts_worker w;
3152 char buf[64];
3153 } *workers_memory;
3155 /* not needed if only one worker */
3156 cilk_fiber_pool_init(&g->fiber_pool,
3157 NULL,
3158 g->stack_size,
3159 g->global_fiber_pool_size, // buffer_size
3160 g->max_stacks, // maximum # to allocate
3163 cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
3164 (g->max_stacks ? g->max_stacks : INT_MAX));
3166 g->workers = (__cilkrts_worker **)
3167 __cilkrts_malloc(total_workers * sizeof(*g->workers));
3169 // Allocate 1 block of memory for workers to make life easier for tools
3170 // like Inspector which run multithreaded and need to know the memory
3171 // range for all the workers that will be accessed in a user's program
3172 workers_memory = (struct buffered_worker*)
3173 __cilkrts_malloc(sizeof(*workers_memory) * total_workers);
3175 // Notify any tools that care (Cilkscreen and Inspector) that they should
3176 // ignore memory allocated for the workers
3177 __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
3178 &workers_memory[total_workers]);
3180 // Initialize worker structs, including unused worker slots.
3181 for (i = 0; i < total_workers; ++i) {
3182 g->workers[i] = make_worker(g, i, &workers_memory[i].w);
3185 // Set the workers in the first P - 1 slots to be system workers.
3186 // Remaining worker structs already have type == 0.
3187 for (i = 0; i < g->system_workers; ++i) {
3188 make_worker_system(g->workers[i]);
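/* After init_workers() the worker array satisfies, roughly, the following
 * invariant (a sketch; WORKER_FREE slots are later bound to user threads
 * on demand):
 *
 *     for (i = 0; i < g->total_workers; ++i)
 *         CILK_ASSERT(g->workers[i]->l->type ==
 *                     (i < g->system_workers ? WORKER_SYSTEM
 *                                            : WORKER_FREE));
 */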
3192 void __cilkrts_init_internal(int start)
3194 global_state_t *g = NULL;
3196 if (cilkg_is_published()) {
3197 g = cilkg_init_global_state();
3199 else {
3201 // We think the state has not been published yet.
3202 // Grab the lock and try to initialize/publish.
3203 global_os_mutex_lock();
3205 if (cilkg_is_published()) {
3206 // Some other thread must have snuck in and published.
3207 g = cilkg_init_global_state();
3209 else {
3210 // Initialize and retrieve global state
3211 g = cilkg_init_global_state();
3213 // Set the scheduler pointer
3214 g->scheduler = worker_scheduler_function;
3216 // If we're running under a sequential P-Tool (Cilkscreen or
3217 // Cilkview) then there's only one worker and we need to tell
3218 // the tool about the extent of the stack
3219 if (g->under_ptool)
3220 __cilkrts_establish_c_stack();
3221 init_workers(g);
3223 // Initialize per-worker record/replay logging
3224 replay_init_workers(g);
3226 // Initialize any system dependent global state
3227 __cilkrts_init_global_sysdep(g);
3230 cilkg_publish_global_state(g);
3233 global_os_mutex_unlock();
3236 CILK_ASSERT(g);
3238 if (start && !g->workers_running)
3240 // Acquire the global OS mutex while we're starting the workers
3241 global_os_mutex_lock();
3242 if (!g->workers_running)
3243 // Start P - 1 system workers since P includes the first user
3244 // worker.
3245 __cilkrts_start_workers(g, g->P - 1);
3246 global_os_mutex_unlock();
3251 /************************************************************************
3252 Methods for reducer protocol.
3254 Reductions occur in two places:
3255 A. A full frame "ff" is returning from a spawn with a stolen parent.
3256 B. A full frame "ff" is stalling at a sync.
3258 To support parallel reductions, reduction functions need to be
3259 executed while control is on a user stack, before jumping into the
3260 runtime. These reductions cannot occur while holding a worker or
3261 frame lock.
3263 Before a worker w executes a reduction in either Case A or B, w's
3264 deque is empty.
3266 Since parallel reductions push work onto the deque, we must do extra
3267 work to set up runtime data structures properly before reductions
3268 begin to allow stealing. ( Normally, when we have only serial
3269 reductions, once a worker w starts a reduction, its deque remains
3270 empty until w either steals another frame or resumes a suspended
3271 frame. Thus, we don't care about the state of the deque, since w
3272 will reset its deque when setting up execution of a frame. )
3274 To allow for parallel reductions, we coerce the runtime data
3275 structures so that, from their perspective, it looks as though we
3276 have spliced in an "execute_reductions()" function. Consider the
3277 two cases for reductions:
3279 Case A: Return from a spawn with a stolen parent.
3280 Consider a spawned function g that is returning on a worker w.
3281 Assume:
3282 - g was spawned from a parent function f.
3283 - ff is the full frame for g's spawn helper
3284 - sf is the __cilkrts_stack_frame for g's spawn helper.
3286 We are conceptually splicing "execute_reductions()" so that it
3287 occurs immediately before the spawn helper of g returns to f.
3289 We do so by creating two different world views --- one for the
3290 runtime data structures, and one for the actual control flow.
3292 - Before reductions begin, the runtime data structures should
3293 look as though the spawn helper of g is calling
3294 "execute_reductions()", in terms of both the user stack and
3295 worker deque. More precisely, w should satisfy the
3296 following properties:
3298 (a) w has ff as its full frame,
3299 (b) w has sf as its __cilkrts_stack_frame, and
3300 (c) w has an empty deque.
3302 If the runtime satisfies these properties, then if w
3303 encounters a spawn in a parallel reduction, it can push onto
3304 a valid deque. Also, when a steal from w occurs, the thief
3305 will build the correct tree of full frames.
3307 - In actual control flow, however, once the
3308 "execute_reductions()" function returns, it is actually
3309 returning to runtime code instead of g's spawn helper.
3311 At the point a worker w began executing reductions, the
3312 control flow / compiled code had already finished g's spawn
3313 helper, and w was about to enter the runtime. With parallel
3314 reductions, some worker v (which might be different from w)
3315 is the one returning to the runtime.
3318 The reduction logic consists of 4 steps:
3320 A1. Restore runtime data structures to make it look as though
3321 the spawn helper of g() is still the currently executing
3322 frame for w.
3324 A2. Execute reductions on the user stack. Reductions also
3325 includes the logic for exceptions and stacks. Note that
3326 reductions start on w, but may finish on a different
3327 worker if there is parallelism in the reduce.
3329 A3. Splice out ff from the tree of full frames.
3331 A4. Jump into the runtime/scheduling stack and execute
3332 "do_return_from_spawn". This method
3334 (a) Frees the user stack we were just on if it is no longer needed.
3335 (b) Decrements the join counter on ff->parent and tries to do a
3336 provably good steal.
3337 (c) Cleans up the full frame ff.
3340 Case B: Stalling at a sync.
3342 Consider a function g(), with full frame ff and
3343 __cilkrts_stack_frame sf. Suppose g() stalls at a sync, and we
3344 are executing reductions.
3346 Conceptually, we are splicing in an "execute_reductions()"
3347 function into g() as the last action that g() takes immediately
3348 before it executes the cilk_sync.
3350 The reduction logic for this case is similar to Case A.
3352 B1. Restore the runtime data structures.
3354 The main difference from Case A is that ff/sf is still a
3355 frame that needs to be executed later (since it is stalling
3356 at a cilk_sync). Thus, we also need to save the current
3357 stack information into "ff" so that we can correctly resume
3358 execution of "ff" after the sync.
3360 B2. Execute reductions on the user stack.
3362 B3. No frame to splice out of the tree.
3364 B4. Jump into the runtime/scheduling stack and execute "do_sync".
3365 This method:
3366 (a) Frees the user stack we were just on if it is no longer needed.
3367 (b) Tries to execute a provably good steal.
3369 Finally, for the reducer protocol, we consider two reduction paths,
3370 namely a "fast" and "slow" path. On a fast path, only trivial
3371 merges of reducer maps happen (i.e., one or both of the maps are
3372 NULL). Otherwise, on the slow path, a reduction actually needs to
3373 happen.
3375 *****************************************************************/
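/* In code terms, "depositing left" in either case comes down to the
 * pointer pair computed by the compute_left_ptrs_* helpers below.  A
 * hedged sketch, where merge_maps() is a hypothetical stand-in for the
 * actual reducer-map merge (which is not defined in this part of the
 * file):
 *
 *     splice_left_ptrs left = compute_left_ptrs_for_spawn_return(w, ff);
 *     // Deposit left, preserving the left-to-right (serial) order of
 *     // the strands: left <- reduce(L, M, R).
 *     *left.map_ptr = merge_maps(*left.map_ptr,           // L
 *                                w->reducer_map,          // M
 *                                ff->right_reducer_map);  // R
 *     w->reducer_map = NULL;
 *     ff->right_reducer_map = NULL;
 *
 * The fast path below handles the cases where at most one of L, M, and R
 * is non-NULL, so no real merge is needed.
 */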
3378 * @brief Locations to store the result of a reduction.
3380 * Struct storing pointers to the fields in our "left" sibling that we
3381 * should update when splicing out a full frame or stalling at a sync.
3383 typedef struct {
3384 /** A pointer to the location of our left reducer map. */
3385 struct cilkred_map **map_ptr;
3387 /** A pointer to the location of our left exception. */
3388 struct pending_exception_info **exception_ptr;
3389 } splice_left_ptrs;
3392 * For a full frame returning from a spawn, calculate the pointers to
3393 * the maps and exceptions to my left.
3395 * @param w The currently executing worker.
3396 * @param ff Full frame that is dying
3397 * @return Pointers to our "left" for reducers and exceptions.
3399 static inline
3400 splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
3401 full_frame *ff)
3403 // ASSERT: we hold the lock on ff->parent
3405 splice_left_ptrs left_ptrs;
3406 if (ff->left_sibling) {
3407 left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
3408 left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
3410 else {
3411 full_frame *parent_ff = ff->parent;
3412 left_ptrs.map_ptr = &parent_ff->children_reducer_map;
3413 left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
3415 return left_ptrs;
3419 * For a full frame at a sync, calculate the pointers to the maps and
3420 * exceptions to my left.
3422 * @param w The currently executing worker.
3423 * @param ff Full frame that is stalling at a sync.
3424 * @return Pointers to our "left" for reducers and exceptions.
3426 static inline
3427 splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
3428 full_frame *ff)
3430 // ASSERT: we hold the lock on ff
3431 splice_left_ptrs left_ptrs;
3433 // Figure out which map to the left we should merge into.
3434 if (ff->rightmost_child) {
3435 CILK_ASSERT(ff->rightmost_child->parent == ff);
3436 left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
3437 left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
3439 else {
3440 // We have no children. Then, we should be the last
3441 // worker at the sync... "left" is our child map.
3442 left_ptrs.map_ptr = &(ff->children_reducer_map);
3443 left_ptrs.exception_ptr = &(ff->child_pending_exception);
3445 return left_ptrs;

/**
 * After we have completed all reductions on a spawn return, call this
 * method to finish up before jumping into the runtime.
 *
 *   1. Perform the "reduction" on stacks, i.e., execute the left
 *      holder logic to pass the leftmost stack up.
 *
 *      w->l->fiber_to_free holds any stack that needs to be freed
 *      when control switches into the runtime fiber.
 *
 *   2. Unlink and remove child_ff from the tree of full frames.
 *
 * @param w          The currently executing worker.
 * @param parent_ff  The parent of child_ff.
 * @param child_ff   The full frame returning from a spawn.
 */
static inline
void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
                                       full_frame *parent_ff,
                                       full_frame *child_ff)
{
    CILK_ASSERT(w->l->fiber_to_free == NULL);

    // Execute left-holder logic for stacks.
    if (child_ff->left_sibling || parent_ff->fiber_child) {
        // Case where we are not the leftmost stack.
        CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);

        // Remember any fiber we need to free in the worker.
        // After we jump into the runtime, we will actually do the
        // free.
        w->l->fiber_to_free = child_ff->fiber_self;
    }
    else {
        // We are leftmost, pass stack/fiber up to parent.
        // Thus, no stack/fiber to free.
        parent_ff->fiber_child = child_ff->fiber_self;
        w->l->fiber_to_free = NULL;
    }

    child_ff->fiber_self = NULL;

    unlink_child(parent_ff, child_ff);
}
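
/*
 * Editor's sketch (illustration only, not part of the runtime): a worked
 * example of the left-holder rule implemented above.  The predicate name
 * and the C1/C2/C3 scenario are hypothetical.
 */
#if 0   /* Never compiled -- illustrative only. */
static int sketch_child_fiber_is_freed(full_frame *parent_ff,
                                       full_frame *child_ff)
{
    // Example: children C1, C2, C3 of parent P return from their spawns.
    // C3 and C2 each have a left sibling, so their fibers are simply
    // freed.  C1 is leftmost; if P has not yet adopted a fiber, C1's
    // fiber is passed up and becomes parent_ff->fiber_child instead.
    return (child_ff->left_sibling != NULL) ||
           (parent_ff->fiber_child != NULL);
}
#endif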

/**
 * Executes any fast reductions necessary to splice ff out of the tree
 * of full frames.
 *
 * This "fast" path performs only trivial merges of reducer maps,
 * i.e., when one of them is NULL.
 * (See slow_path_reductions_for_spawn_return() for slow path.)
 *
 * Returns NULL if we finished all reductions.
 * Returns a non-NULL pointer if there are still reductions to execute,
 * and we should execute the slow path.
 *
 * This method assumes w holds the frame lock on parent_ff.
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of child_ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the stack
 *       that needs to be freed once we jump into the runtime.
 *
 * We have not, however, decremented the join counter on ff->parent.
 * This prevents any other workers from resuming execution of the parent.
 *
 * @param w The currently executing worker.
 * @param ff The full frame returning from a spawn.
 * @return NULL if we finished all reductions.
 * @return The address where the left map is stored (which should be passed to
 *         slow_path_reductions_for_spawn_return()) if there are
 *         still reductions to execute.
 */
struct cilkred_map**
fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff)
{
    // ASSERT: we hold ff->parent->lock.
    splice_left_ptrs left_ptrs;

    CILK_ASSERT(NULL == w->l->pending_exception);

    // Figure out the pointers to the left where I want
    // to put reducers and exceptions.
    left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

    // Go ahead and merge exceptions while holding the lock.
    splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);

    // Now check if we have any reductions to perform.
    //
    // Consider all the cases of left, middle and right maps.
    //  0. (-, -, -)  : finish and return 1
    //  1. (L, -, -)  : finish and return 1
    //  2. (-, M, -)  : slide over to left, finish, and return 1.
    //  3. (L, M, -)  : return 0
    //  4. (-, -, R)  : slide over to left, finish, and return 1.
    //  5. (L, -, R)  : return 0
    //  6. (-, M, R)  : return 0
    //  7. (L, M, R)  : return 0
    //
    // ("Return 1" here means we are finished and this function returns
    //  NULL; "return 0" means the slow path is needed and it returns
    //  left_ptrs.map_ptr.)
    //
    // In terms of code:
    //   L == *left_ptrs.map_ptr
    //   M == w->reducer_map
    //   R == ff->right_reducer_map.
    //
    // The goal of the code below is to execute the fast path with
    // as few branches and writes as possible.

    int case_value = (*(left_ptrs.map_ptr) != NULL);
    case_value += ((w->reducer_map != NULL) << 1);
    case_value += ((ff->right_reducer_map != NULL) << 2);

    // Fastest path is case_value == 0 or 1.
    if (case_value >= 2) {
        switch (case_value) {
        case 2:
            *(left_ptrs.map_ptr) = w->reducer_map;
            w->reducer_map = NULL;
            return NULL;
        case 4:
            *(left_ptrs.map_ptr) = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
            return NULL;
        default:
            // If we have to execute the slow path, then
            // return the pointer to the place to deposit the left
            // map.
            return left_ptrs.map_ptr;
        }
    }

    // Do nothing: there is at most one map, and it is already as far
    // "left" as it can go.
    return NULL;
}
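
/*
 * Editor's sketch (illustration only, not part of the runtime): the fast
 * path above packs (L, M, R) into a 3-bit case_value.  A hypothetical
 * restatement plus a couple of worked values:
 *
 *     case_value = (L != NULL) + 2*(M != NULL) + 4*(R != NULL)
 *
 *     (-, M, -) -> 2 : M slides into the empty left slot; done.
 *     (-, -, R) -> 4 : R slides into the empty left slot; done.
 *     (L, -, R) -> 5 : two non-NULL maps must be merged; slow path.
 */
#if 0   /* Never compiled -- illustrative only. */
static int sketch_case_value(struct cilkred_map *L,
                             struct cilkred_map *M,
                             struct cilkred_map *R)
{
    return (L != NULL) + ((M != NULL) << 1) + ((R != NULL) << 2);
}
#endif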

/**
 * Executes any reductions necessary to splice the "ff" frame out of
 * the steal tree.
 *
 * This method executes the "slow" path for reductions on a spawn
 * return, i.e., there are non-NULL maps that need to be merged
 * together.
 *
 * This method should execute only if
 * fast_path_reductions_for_spawn_return() returns a non-NULL
 * left_map_ptr.
 *
 * Upon entry, left_map_ptr should be the location of the left map
 * at the start of the reduction, as calculated by
 * fast_path_reductions_for_spawn_return().
 *
 * After this method completes:
 *    1. We have spliced ff out of the tree of full frames.
 *    2. The reducer maps of child_ff have been deposited
 *       "left" according to the reducer protocol.
 *    3. w->l->fiber_to_free stores the stack
 *       that needs to be freed once we jump into the runtime.
 * We have not, however, decremented the join counter on ff->parent,
 * so no one can resume execution of the parent yet.
 *
 * WARNING:
 * This method assumes the lock on ff->parent is held upon entry, and
 * the worker that returns still holds that lock upon exit.
 * This method can, however, release and reacquire the lock on ff->parent
 * while it runs.
 *
 * @param w The currently executing worker.
 * @param ff The full frame returning from a spawn.
 * @param left_map_ptr Pointer to our initial left map.
 * @return The worker that this method returns on.
 */
static __cilkrts_worker*
slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff,
                                      struct cilkred_map **left_map_ptr)
{
    // CILK_ASSERT: w is holding frame lock on parent_ff.
#if REDPAR_DEBUG > 0
    CILK_ASSERT(!ff->rightmost_child);
    CILK_ASSERT(!ff->is_call_child);
#endif

    // Loop invariant:
    // When beginning this loop, we should
    //  1. Be holding the lock on ff->parent.
    //  2. left_map_ptr should be the address of the pointer to the left map.
    //  3. All maps should be slid over left by one, if possible.
    //  4. All exceptions should be merged so far.
    while (1) {

        // Slide middle map left if possible.
        if (!(*left_map_ptr)) {
            *left_map_ptr = w->reducer_map;
            w->reducer_map = NULL;
        }
        // Slide right map to middle if possible.
        if (!w->reducer_map) {
            w->reducer_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
        }

        // Since we slid everything left by one,
        // we are finished if there is no middle map.
        if (!w->reducer_map) {
            verify_current_wkr(w);
            return w;
        }
        else {
            struct cilkred_map* left_map;
            struct cilkred_map* middle_map;
            struct cilkred_map* right_map;

            // Take all the maps from their respective locations.
            // We can't leave them in place and execute a reduction because
            // these fields might change once we release the lock.
            left_map = *left_map_ptr;
            *left_map_ptr = NULL;
            middle_map = w->reducer_map;
            w->reducer_map = NULL;
            right_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;

            // WARNING!!! Lock release here.
            // We have reductions to execute (and we can't hold locks).
            __cilkrts_frame_unlock(w, ff->parent);

            // After we've released the lock, start counting time as
            // WORKING again.
            STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
            START_INTERVAL(w, INTERVAL_WORKING);

            // Merge all reducers into the left map.
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   middle_map);
            verify_current_wkr(w);
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   right_map);
            verify_current_wkr(w);
            CILK_ASSERT(NULL == w->reducer_map);
            // Put the final answer back into w->reducer_map.
            w->reducer_map = left_map;

            // Save any exceptions generated because of the reduction
            // process from the returning worker.  These get merged
            // the next time around the loop.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;

            STOP_INTERVAL(w, INTERVAL_WORKING);
            START_INTERVAL(w, INTERVAL_IN_RUNTIME);

            // Lock ff->parent for the next loop around.
            __cilkrts_frame_lock(w, ff->parent);

            // Once we have the lock again, recompute who is to our
            // left.
            splice_left_ptrs left_ptrs;
            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

            // Update the pointer for the left map.
            left_map_ptr = left_ptrs.map_ptr;
            // Splice the exceptions for spawn.
            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
        }
    }

    // We should never break out of this loop.
    CILK_ASSERT(0);
    return NULL;
}
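
/*
 * Editor's sketch (illustration only, not part of the runtime): one
 * iteration of the loop above, on concrete maps A (left), B (middle) and
 * C (right), merges strictly left-to-right:
 *
 *     result = reduce(reduce(A, B), C)
 *
 * Reducer operations are only required to be associative, not
 * commutative, so B must never be folded into C before A has been folded
 * in.  The hypothetical helper below merely restates the two
 * repeated_merge_reducer_maps() calls made above.
 */
#if 0   /* Never compiled -- illustrative only. */
static struct cilkred_map *
sketch_merge_left_to_right(__cilkrts_worker **w_ptr,
                           struct cilkred_map *left_map,
                           struct cilkred_map *middle_map,
                           struct cilkred_map *right_map)
{
    left_map = repeated_merge_reducer_maps(w_ptr, left_map, middle_map);
    left_map = repeated_merge_reducer_maps(w_ptr, left_map, right_map);
    return left_map;
}
#endif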

/**
 * Execute reductions when returning from a spawn whose parent has
 * been stolen.
 *
 * Execution may start on w, but may finish on a different worker.
 * This method acquires/releases the lock on ff->parent.
 *
 * @param w The currently executing worker.
 * @param ff The full frame of the spawned function that is returning.
 * @param returning_sf The __cilkrts_stack_frame for this returning function.
 * @return The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_spawn_return(__cilkrts_worker *w,
                                    full_frame *ff,
                                    __cilkrts_stack_frame *returning_sf)
{
    // Step A1 from reducer protocol described above.
    //
    // Coerce the runtime into thinking that
    // ff/returning_sf are still on the bottom of
    // w's deque.
    restore_frame_for_spawn_return_reduction(w, ff, returning_sf);

    // Step A2 and A3: Execute reductions on user stack.
    BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
        struct cilkred_map **left_map_ptr;
        left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);

        // Pointer will be non-NULL if there are
        // still reductions to execute.
        if (left_map_ptr) {
            // WARNING: This method call may release the lock
            // on ff->parent and re-acquire it (possibly on a
            // different worker).
            // We can't hold locks while actually executing
            // reduce functions.
            w = slow_path_reductions_for_spawn_return(w,
                                                      ff,
                                                      left_map_ptr);
            verify_current_wkr(w);
        }

        finish_spawn_return_on_user_stack(w, ff->parent, ff);
        // WARNING: the use of this lock macro is deceptive.
        // The worker may have changed here.
    } END_WITH_FRAME_LOCK(w, ff->parent);
    return w;
}
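
/*
 * Editor's sketch (illustration only, not part of the runtime): a
 * hypothetical call site for the function above.  Because reduce
 * functions run with the lock dropped, execution may resume on a
 * different worker, so the caller must adopt the returned worker rather
 * than keep using its old 'w'.
 */
#if 0   /* Never compiled -- illustrative only. */
static void sketch_spawn_return_call_site(__cilkrts_worker *w,
                                          full_frame *ff,
                                          __cilkrts_stack_frame *sf)
{
    w = execute_reductions_for_spawn_return(w, ff, sf);
    verify_current_wkr(w);   // 'w' may differ from the worker we started on.
    // ... continue into the runtime using the (possibly new) 'w',
    // which also frees w->l->fiber_to_free if it was set ...
}
#endif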

/**
 * Execute fast "reductions" when ff stalls at a sync.
 *
 * @param w The currently executing worker.
 * @param ff The full frame stalling at a sync.
 * @return 1 if we are finished with all reductions after calling this method.
 * @return 0 if we still need to execute the slow path reductions.
 */
static inline
int fast_path_reductions_for_sync(__cilkrts_worker *w,
                                  full_frame *ff) {
    // Return 0 if there is some reduction that needs to happen.
    return !(w->reducer_map || ff->pending_exception);
}

/**
 * Executes slow reductions when ff stalls at a sync.
 * This method should execute only if
 * fast_path_reductions_for_sync(w, ff) returned 0.
 *
 * After this method completes:
 *   1. ff's current reducer map has been deposited into
 *      right_reducer_map of ff's rightmost child, or
 *      ff->children_reducer_map if ff has no children.
 *   2. Similarly for ff's current exception.
 *   3. Nothing to calculate for stacks --- if we are stalling
 *      we will always free a stack.
 *
 * This method may repeatedly acquire/release the lock on ff.
 *
 * @param w The currently executing worker.
 * @param ff The full frame stalling at a sync.
 * @return The worker returning from this method.
 */
static __cilkrts_worker*
slow_path_reductions_for_sync(__cilkrts_worker *w,
                              full_frame *ff)
{
    struct cilkred_map *left_map;
    struct cilkred_map *middle_map;

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(ff);
    CILK_ASSERT(w->head == w->tail);
#endif

    middle_map = w->reducer_map;
    w->reducer_map = NULL;

    // Loop invariant: middle_map should be valid (the current map to reduce).
    //                 left_map is junk.
    //                 w->reducer_map == NULL.
    while (1) {
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);

            // Grab the "left" map and store pointers to those locations.
            left_map = *(left_ptrs.map_ptr);
            *(left_ptrs.map_ptr) = NULL;

            // Slide the maps in our struct left as far as possible.
            if (!left_map) {
                left_map = middle_map;
                middle_map = NULL;
            }

            *(left_ptrs.exception_ptr) =
                __cilkrts_merge_pending_exceptions(w,
                                                   *left_ptrs.exception_ptr,
                                                   ff->pending_exception);
            ff->pending_exception = NULL;

            // If there is no middle map, then we are done.
            // Deposit left and return.
            if (!middle_map) {
                *(left_ptrs.map_ptr) = left_map;
#if (REDPAR_DEBUG > 0)
                CILK_ASSERT(NULL == w->reducer_map);
#endif
                // Sanity check upon leaving the loop.
                verify_current_wkr(w);
                // Make sure to unlock before we return!
                __cilkrts_frame_unlock(w, ff);
                return w;
            }
        } END_WITH_FRAME_LOCK(w, ff);

        // After we've released the lock, start counting time as
        // WORKING again.
        STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
        START_INTERVAL(w, INTERVAL_WORKING);

        // If we get here, we have a nontrivial reduction to execute.
        middle_map = repeated_merge_reducer_maps(&w,
                                                 left_map,
                                                 middle_map);
        verify_current_wkr(w);

        STOP_INTERVAL(w, INTERVAL_WORKING);
        START_INTERVAL(w, INTERVAL_IN_RUNTIME);

        // Save any exceptions generated because of the reduction
        // process.  These get merged the next time around the
        // loop.
        CILK_ASSERT(NULL == ff->pending_exception);
        ff->pending_exception = w->l->pending_exception;
        w->l->pending_exception = NULL;
    }

    // We should never break out of the loop above.
    CILK_ASSERT(0);
    return NULL;
}
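
/*
 * Editor's sketch (illustration only; user-level code, not runtime code):
 * the merges performed above are what make user-visible reducers come out
 * right when strands join.  The example assumes the Cilk Plus C reducer
 * API from <cilk/reducer_opadd.h>; every steal gives the thief a fresh
 * view of 'sum', and the per-worker views live in the cilkred_map objects
 * that slow_path_reductions_for_sync() folds back together at the sync.
 */
#if 0   /* Never compiled here -- illustrative only. */
#include <cilk/cilk.h>
#include <cilk/reducer_opadd.h>

int sketch_parallel_sum(const int *a, int n)
{
    CILK_C_REDUCER_OPADD(sum, int, 0);
    CILK_C_REGISTER_REDUCER(sum);
    cilk_for (int i = 0; i < n; ++i)
        REDUCER_VIEW(sum) += a[i];
    // cilk_for ends with an implicit sync; if that sync stalls, the
    // outstanding views are merged by the code above.
    int total = REDUCER_VIEW(sum);
    CILK_C_UNREGISTER_REDUCER(sum);
    return total;
}
#endif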

/**
 * Execute reductions when ff stalls at a sync.
 *
 * Execution starts on w, but may finish on a different worker.
 * This method may acquire/release the lock on ff.
 *
 * @param w The currently executing worker.
 * @param ff The full frame of the spawned function at the sync.
 * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync.
 * @return The worker returning from this method.
 */
static __cilkrts_worker*
execute_reductions_for_sync(__cilkrts_worker *w,
                            full_frame *ff,
                            __cilkrts_stack_frame *sf_at_sync)
{
    int finished_reductions;
    // Step B1 from reducer protocol above:
    // Restore runtime invariants.
    //
    // The following code for this step is almost equivalent to
    // the following sequence:
    //   1. disown(w, ff, sf_at_sync, "sync") (which itself
    //      calls make_unrunnable(w, ff, sf_at_sync))
    //   2. make_runnable(w, ff, sf_at_sync).
    //
    // The "disown" will mark the frame "sf_at_sync"
    // as stolen and suspended, and save its place on the stack,
    // so it can be resumed after the sync.
    //
    // The difference is that we don't want the disown to
    // break the following connections yet, since we are
    // about to immediately make sf/ff runnable again anyway.
    //   sf_at_sync->worker == w
    //   w->l->frame_ff == ff.
    //
    // These connections are needed for parallel reductions, since
    // we will use sf / ff as the stack frame / full frame for
    // executing any potential reductions.
    //
    // TBD: Can we refactor the disown / make_unrunnable code
    // to avoid the code duplication here?

    ff->call_stack = NULL;

    // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
    // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
    // the stack so that a worker can resume the frame in the correct
    // place.
    //
    // But on this path, CILK_FRAME_STOLEN should already be set.
    // Also, we technically don't want to suspend the frame until
    // the reduction finishes.
    // We do, however, need to save the stack before
    // we start any reductions, since the reductions might push more
    // data onto the stack.
    CILK_ASSERT(sf_at_sync->flags & CILK_FRAME_STOLEN);

    __cilkrts_put_stack(ff, sf_at_sync);
    __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
                                     "execute_reductions_for_sync");
    CILK_ASSERT(w->l->frame_ff == ff);

    // Step B2: Execute reductions on user stack.
    // Check if we have any "real" reductions to do.
    finished_reductions = fast_path_reductions_for_sync(w, ff);

    if (!finished_reductions) {
        // Still have some real reductions to execute.
        // Run them here.
        //
        // This method may acquire/release the lock on ff.
        w = slow_path_reductions_for_sync(w, ff);

        // The previous call may return on a different worker
        // than the one we started on.
        verify_current_wkr(w);
    }

#if REDPAR_DEBUG >= 0
    CILK_ASSERT(w->l->frame_ff == ff);
    CILK_ASSERT(ff->call_stack == NULL);
#endif

    // Now we suspend the frame ff (since we've
    // finished the reductions).  Roughly, we've split apart the
    // "make_unrunnable" call here --- we've already saved the
    // stack info earlier before the reductions execute.
    // All that remains is to restore the call stack back into the
    // full frame, and mark the frame as suspended.
    ff->call_stack = sf_at_sync;
    sf_at_sync->flags |= CILK_FRAME_SUSPENDED;

    // At a nontrivial sync, we should always free the current fiber,
    // because it cannot be leftmost.
    w->l->fiber_to_free = ff->fiber_self;
    ff->fiber_self = NULL;
    return w;
}
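
/*
 * Editor's sketch (illustration only, not part of the runtime): a
 * hypothetical call site for the sync path above.  As on the spawn-return
 * path, the caller must adopt the returned worker; it should also expect
 * w->l->fiber_to_free to have been set, since a frame stalling at a
 * nontrivial sync always gives up its fiber before jumping into the
 * runtime.
 */
#if 0   /* Never compiled -- illustrative only. */
static void sketch_sync_call_site(__cilkrts_worker *w,
                                  full_frame *ff,
                                  __cilkrts_stack_frame *sf_at_sync)
{
    w = execute_reductions_for_sync(w, ff, sf_at_sync);
    verify_current_wkr(w);
    // The suspended frame's fiber is now queued for freeing; the jump
    // into the runtime scheduling fiber performs the actual free.
}
#endif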

/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/