libcilkrts/runtime/scheduler.c

   1 /* scheduler.c                  -*-C-*-
   2  *
   3  *************************************************************************
   4  *
   5  *  Copyright (C) 2007-2016, Intel Corporation
   6  *  All rights reserved.
   7  *
   8  *  Redistribution and use in source and binary forms, with or without
   9  *  modification, are permitted provided that the following conditions
  10  *  are met:
  11  *
  12  *    * Redistributions of source code must retain the above copyright
  13  *      notice, this list of conditions and the following disclaimer.
  14  *    * Redistributions in binary form must reproduce the above copyright
  15  *      notice, this list of conditions and the following disclaimer in
  16  *      the documentation and/or other materials provided with the
  17  *      distribution.
  18  *    * Neither the name of Intel Corporation nor the names of its
  19  *      contributors may be used to endorse or promote products derived
  20  *      from this software without specific prior written permission.
  21  *
  22  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  26  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  29  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  30  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  32  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33  *  POSSIBILITY OF SUCH DAMAGE.
  34  *
  35  *  *********************************************************************
  36  *
  37  *  PLEASE NOTE: This file is a downstream copy of a file mainitained in
  38  *  a repository at cilkplus.org. Changes made to this file that are not
  39  *  submitted through the contribution process detailed at
  40  *  http://www.cilkplus.org/submit-cilk-contribution will be lost the next
  41  *  time that a new version is released. Changes only submitted to the
  42  *  GNU compiler collection or posted to the git repository at
  43  *  https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
  44  *  not tracked.
  45  *
  46  *  We welcome your contributions to this open source project. Thank you
  47  *  for your assistance in helping us improve Cilk Plus.
  48  *
  49  **************************************************************************/
  50
  51 /*
  52  * Cilk scheduler
  53  */
  54
  55 #include "scheduler.h"
  56 #include "bug.h"
  57 #include "os.h"
  58 #include "os_mutex.h"
  59 #include "local_state.h"
  60 #include "signal_node.h"
  61 #include "full_frame.h"
  62 #include "sysdep.h"
  63 #include "except.h"
  64 #include "cilk_malloc.h"
  65 #include "pedigrees.h"
  66 #include "record-replay.h"
  67
  68 #include <limits.h>
  69 #include <string.h> /* memcpy */
  70 #include <stdio.h>  // sprintf
  71 #include <stdlib.h> // malloc, free, abort
  72
  73 #ifdef _WIN32
  74 #   pragma warning(disable:1786)   // disable warning: sprintf is deprecated
  75 #   include "sysdep-win.h"
  76 #   include "except-win32.h"
  77 #endif  // _WIN32
  78
  79 // ICL: Don't complain about conversion from pointer to same-sized integral
  80 // type in __cilkrts_put_stack.  That's why we're using ptrdiff_t
  81 #ifdef _WIN32
  82 #   pragma warning(disable: 1684)
  83 #endif
  84
  85 #include "cilk/cilk_api.h"
  86 #include "frame_malloc.h"
  87 #include "metacall_impl.h"
  88 #include "reducer_impl.h"
  89 #include "cilk-tbb-interop.h"
  90 #include "cilk-ittnotify.h"
  91 #include "stats.h"
  92
  93 // ICL: Don't complain about loss of precision in myrand
  94 // I tried restoring the warning after the function, but it didn't
  95 // suppress it
  96 #ifdef _WIN32
  97 #   pragma warning(disable: 2259)
  98 #endif
  99
 100 #ifndef _WIN32
 101 #   include <unistd.h>
 102 #endif
 103
 104 #ifdef __VXWORKS__
 105 // redeclare longjmp() with noreturn to stop warnings
 106 extern __attribute__((noreturn))
 107                 void longjmp(jmp_buf, int);
 108 #endif
 109
 110 //#define DEBUG_LOCKS 1
 111 #ifdef DEBUG_LOCKS
 112 // The currently executing worker must own this worker's lock
 113 #   define ASSERT_WORKER_LOCK_OWNED(w) \
 114         { \
 115             __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
 116             CILK_ASSERT((w)->l->lock.owner == tls_worker); \
 117         }
 118 #else
 119 #   define ASSERT_WORKER_LOCK_OWNED(w)
 120 #endif // DEBUG_LOCKS
 121
 122 // Options for the scheduler.
 123 enum schedule_t { SCHEDULE_RUN,
 124                   SCHEDULE_WAIT,
 125                   SCHEDULE_EXIT };
 126
 127 // Return values for provably_good_steal()
 128 enum provably_good_steal_t
 129 {
 130     ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
 131     CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
 132     WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
 133                         // which continued.  Loop until we are the last worker
 134                         // to the sync.
 135 };
 136
 137
 138 // Verify that "w" is the worker we are currently executing on.
 139 // Because this check is expensive, this method is usually a no-op.
 140 static inline void verify_current_wkr(__cilkrts_worker *w)
 141 {
 142 #if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
 143     // Lookup the worker from TLS and compare to w.
 144     __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
 145     if (w != tmp) {
 146         fprintf(stderr, "Error.  W=%d, actual worker =%d...\n",
 147                 w->self,
 148                 tmp->self);
 149     }
 150     CILK_ASSERT(w == tmp);
 151 #endif
 152 }
 153
 154 static enum schedule_t worker_runnable(__cilkrts_worker *w);
 155
 156 // Scheduling-fiber functions:
 157 static void do_return_from_spawn (__cilkrts_worker *w,
 158                                   full_frame *ff,
 159                                   __cilkrts_stack_frame *sf);
 160 static void do_sync (__cilkrts_worker *w,
 161                      full_frame *ff,
 162                      __cilkrts_stack_frame *sf);
 163
 164 // max is defined on Windows and VxWorks
 165 #if (! defined(_WIN32)) && (! defined(__VXWORKS__))
 166     // TBD: definition of max() for Linux.
 167 #   define max(a, b) ((a) < (b) ? (b) : (a))
 168 #endif
 169
 170 void __cilkrts_dump_stats_to_stderr(global_state_t *g)
 171 {
 172 #ifdef CILK_PROFILE
 173     int i;
 174     for (i = 0; i < g->total_workers; ++i) {
 175         // Print out statistics for each worker.  We collected them,
 176         // so why not print them out?
 177         fprintf(stderr, "Stats for worker %d\n", i);
 178         dump_stats_to_file(stderr, g->workers[i]->l->stats);
 179         __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats);
 180     }
 181
 182     // Also print out aggregate statistics.
 183     dump_stats_to_file(stderr, &g->stats);
 184 #endif
 185     fprintf(stderr,
 186             "CILK PLUS Thread Info: P=%d, Q=%d\n",
 187             g->P,
 188             g->Q);
 189     fprintf(stderr,
 190             "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
 191             (long long)g->frame_malloc.allocated_from_os);
 192 #ifdef CILK_PROFILE
 193     if (g->stats.stack_hwm)
 194         fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
 195 #endif
 196     fputc('\n', stderr);
 197 }
 198
 199 static void validate_worker(__cilkrts_worker *w)
 200 {
 201     /* check the magic numbers, for debugging purposes */
 202     if (w->l->worker_magic_0 != WORKER_MAGIC_0 ||
 203         w->l->worker_magic_1 != WORKER_MAGIC_1)
 204         abort_because_rts_is_corrupted();
 205 }
 206
 207 static void double_link(full_frame *left_ff, full_frame *right_ff)
 208 {
 209     if (left_ff)
 210         left_ff->right_sibling = right_ff;
 211     if (right_ff)
 212         right_ff->left_sibling = left_ff;
 213 }
 214
 215 /* add CHILD to the right of all children of PARENT */
 216 static void push_child(full_frame *parent_ff, full_frame *child_ff)
 217 {
 218     double_link(parent_ff->rightmost_child, child_ff);
 219     double_link(child_ff, 0);
 220     parent_ff->rightmost_child = child_ff;
 221 }
 222
 223 /* unlink CHILD from the list of all children of PARENT */
 224 static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
 225 {
 226     double_link(child_ff->left_sibling, child_ff->right_sibling);
 227
 228     if (!child_ff->right_sibling) {
 229         /* this is the rightmost child -- update parent link */
 230         CILK_ASSERT(parent_ff->rightmost_child == child_ff);
 231         parent_ff->rightmost_child = child_ff->left_sibling;
 232     }
 233     child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */
 234 }
 235
 236 static void incjoin(full_frame *ff)
 237 {
 238     ++ff->join_counter;
 239 }
 240
 241 static int decjoin(full_frame *ff)
 242 {
 243     CILK_ASSERT(ff->join_counter > 0);
 244     return (--ff->join_counter);
 245 }
 246
 247 static int simulate_decjoin(full_frame *ff)
 248 {
 249   CILK_ASSERT(ff->join_counter > 0);
 250   return (ff->join_counter - 1);
 251 }
 252
 253 /*
 254  * Pseudo-random generator defined by the congruence S' = 69070 * S
 255  * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
 256  * this is a ``good one''.  There you go.
 257  *
 258  * The literature makes a big fuss about avoiding the division, but
 259  * for us it is not worth the hassle.
 260  */
 261 static const unsigned RNGMOD = ((1ULL << 32) - 5);
 262 static const unsigned RNGMUL = 69070U;
 263
 264 static unsigned myrand(__cilkrts_worker *w)
 265 {
 266     unsigned state = w->l->rand_seed;
 267     state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD);
 268     w->l->rand_seed = state;
 269     return state;
 270 }
 271
 272 static void mysrand(__cilkrts_worker *w, unsigned seed)
 273 {
 274     seed %= RNGMOD;
 275     seed += (seed == 0); /* 0 does not belong to the multiplicative
 276                             group.  Use 1 instead */
 277     w->l->rand_seed = seed;
 278 }
 279
 280 /* W grabs its own lock */
 281 void __cilkrts_worker_lock(__cilkrts_worker *w)
 282 {
 283     validate_worker(w);
 284     CILK_ASSERT(w->l->do_not_steal == 0);
 285
 286     /* tell thieves to stay out of the way */
 287     w->l->do_not_steal = 1;
 288     __cilkrts_fence(); /* probably redundant */
 289
 290     __cilkrts_mutex_lock(w, &w->l->lock);
 291 }
 292
 293 void __cilkrts_worker_unlock(__cilkrts_worker *w)
 294 {
 295     __cilkrts_mutex_unlock(w, &w->l->lock);
 296     CILK_ASSERT(w->l->do_not_steal == 1);
 297     /* The fence is probably redundant.  Use a release
 298        operation when supported (gcc and compatibile);
 299        that is faster on x86 which serializes normal stores. */
 300 #if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
 301     __sync_lock_release(&w->l->do_not_steal);
 302 #else
 303     w->l->do_not_steal = 0;
 304     __cilkrts_fence(); /* store-store barrier, redundant on x86 */
 305 #endif
 306 }
 307
 308 /* try to acquire the lock of some *other* worker */
 309 static int worker_trylock_other(__cilkrts_worker *w,
 310                                 __cilkrts_worker *other)
 311 {
 312     int status = 0;
 313
 314     validate_worker(other);
 315
 316     /* This protocol guarantees that, after setting the DO_NOT_STEAL
 317        flag, worker W can enter its critical section after waiting for
 318        the thief currently in the critical section (if any) and at
 319        most one other thief.
 320
 321        This requirement is overly paranoid, but it should protect us
 322        against future nonsense from OS implementors.
 323     */
 324
 325     /* compete for the right to disturb OTHER */
 326     if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
 327         if (other->l->do_not_steal) {
 328             /* leave it alone */
 329         } else {
 330             status = __cilkrts_mutex_trylock(w, &other->l->lock);
 331         }
 332         __cilkrts_mutex_unlock(w, &other->l->steal_lock);
 333     }
 334
 335
 336     return status;
 337 }
 338
 339 static void worker_unlock_other(__cilkrts_worker *w,
 340                                 __cilkrts_worker *other)
 341 {
 342     __cilkrts_mutex_unlock(w, &other->l->lock);
 343 }
 344
 345
 346 /* Lock macro Usage:
 347     BEGIN_WITH_WORKER_LOCK(w) {
 348         statement;
 349         statement;
 350         BEGIN_WITH_FRAME_LOCK(w, ff) {
 351             statement;
 352             statement;
 353         } END_WITH_FRAME_LOCK(w, ff);
 354     } END_WITH_WORKER_LOCK(w);
 355  */
 356 #define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
 357 #define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)
 358
 359 // TBD(jsukha): These are worker lock acquistions on
 360 // a worker whose deque is empty.  My conjecture is that we
 361 // do not need to hold the worker lock at these points.
 362 // I have left them in for now, however.
 363 //
 364 // #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
 365 #ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
 366     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
 367     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
 368 #else
 369     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
 370     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
 371 #endif
 372
 373
 374 #define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
 375     do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do
 376
 377 #define END_WITH_FRAME_LOCK(w, ff)                       \
 378     while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
 379
 380 /* W becomes the owner of F and F can be stolen from W */
 381 static void make_runnable(__cilkrts_worker *w, full_frame *ff)
 382 {
 383     w->l->frame_ff = ff;
 384
 385     /* CALL_STACK is invalid (the information is stored implicitly in W) */
 386     ff->call_stack = 0;
 387 }
 388
 389 /*
 390  * The worker parameter is unused, except for print-debugging purposes.
 391  */
 392 static void make_unrunnable(__cilkrts_worker *w,
 393                             full_frame *ff,
 394                             __cilkrts_stack_frame *sf,
 395                             int is_loot,
 396                             const char *why)
 397 {
 398     /* CALL_STACK becomes valid again */
 399     ff->call_stack = sf;
 400
 401     if (sf) {
 402 #if CILK_LIB_DEBUG
 403         if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
 404             __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
 405 #endif
 406         sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
 407         sf->worker = 0;
 408
 409         if (is_loot)
 410             __cilkrts_put_stack(ff, sf);
 411
 412         /* perform any system-dependent action, such as saving the
 413            state of the stack */
 414         __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
 415     }
 416 }
 417
 418
 419 /* Push the next full frame to be made active in this worker and increment its
 420  * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
 421  * one-element queue.  This queue is used to communicate across the runtime
 422  * from the code that wants to activate a frame to the code that can actually
 423  * begin execution on that frame.  They are asymetrical in that push
 424  * increments the join counter but pop does not decrement it.  Rather, a
 425  * single push/pop combination makes a frame active and increments its join
 426  * counter once. */
 427 void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
 428 {
 429     CILK_ASSERT(ff);
 430     CILK_ASSERT(!w->l->next_frame_ff);
 431     incjoin(ff);
 432     w->l->next_frame_ff = ff;
 433 }
 434
 435 /* Get the next full-frame to be made active in this worker.  The join count
 436  * of the full frame will have been incremented by the corresponding push
 437  * event.  See __cilkrts_push_next_frame, above.
 438  */
 439 static full_frame *pop_next_frame(__cilkrts_worker *w)
 440 {
 441     full_frame *ff;
 442     ff = w->l->next_frame_ff;
 443     // Remove the frame from the next_frame field.
 444     //
 445     // If this is a user worker, then there is a chance that another worker
 446     // from our team could push work into our next_frame (if it is the last
 447     // worker doing work for this team).  The other worker's setting of the
 448     // next_frame could race with our setting of next_frame to NULL.  This is
 449     // the only possible race condition on next_frame.  However, if next_frame
 450     // has a non-NULL value, then it means the team still has work to do, and
 451     // there is no chance of another team member populating next_frame.  Thus,
 452     // it is safe to set next_frame to NULL, if it was populated.  There is no
 453     // need for an atomic op.
 454     if (NULL != ff) {
 455         w->l->next_frame_ff = NULL;
 456     }
 457     return ff;
 458 }
 459
 460 /*
 461  * Identify the single worker that is allowed to cross a sync in this frame.  A
 462  * thief should call this function when it is the first to steal work from a
 463  * user worker.  "First to steal work" may mean that there has been parallelism
 464  * in the user worker before, but the whole team sync'd, and this is the first
 465  * steal after that.
 466  *
 467  * This should happen while holding the worker and frame lock.
 468  */
 469 static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
 470 {
 471     w->l->last_full_frame = ff;
 472     ff->sync_master = w;
 473 }
 474
 475 /*
 476  * The sync that ends all parallelism for a particular user worker is about to
 477  * be crossed.  Decouple the worker and frame.
 478  *
 479  * No locks need to be held since the user worker isn't doing anything, and none
 480  * of the system workers can steal from it.  But unset_sync_master() should be
 481  * called before the user worker knows about this work (i.e., before it is
 482  * inserted into the w->l->next_frame_ff is set).
 483  */
 484 static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
 485 {
 486     CILK_ASSERT(WORKER_USER == w->l->type);
 487     CILK_ASSERT(ff->sync_master == w);
 488     ff->sync_master = NULL;
 489     w->l->last_full_frame = NULL;
 490 }
 491
 492 /********************************************************************
 493  * THE protocol:
 494  ********************************************************************/
 495 /*
 496  * This is a protocol for work stealing that minimizes the overhead on
 497  * the victim.
 498  *
 499  * The protocol uses three shared pointers into the worker's deque:
 500  * - T - the "tail"
 501  * - H - the "head"
 502  * - E - the "exception"  NB: In this case, "exception" has nothing to do
 503  * with C++ throw-catch exceptions -- it refers only to a non-normal return,
 504  * i.e., a steal or similar scheduling exception.
 505  *
 506  * with H <= E, H <= T.
 507  *
 508  * Stack frames SF, where H <= E < T, are available for stealing.
 509  *
 510  * The worker operates on the T end of the stack.  The frame being
 511  * worked on is not on the stack.  To make a continuation available for
 512  * stealing the worker pushes a from onto the stack: stores *T++ = SF.
 513  * To return, it pops the frame off the stack: obtains SF = *--T.
 514  *
 515  * After decrementing T, the condition E > T signals to the victim that
 516  * it should invoke the runtime system's "THE" exception handler.  The
 517  * pointer E can become INFINITY, in which case the victim must invoke
 518  * the THE exception handler as soon as possible.
 519  *
 520  * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
 521  * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
 522  * on the THE protocol.
 523  */
 524
 525 /* the infinity value of E */
 526 #define EXC_INFINITY  ((__cilkrts_stack_frame **) (-1))
 527
 528 static void increment_E(__cilkrts_worker *victim)
 529 {
 530     __cilkrts_stack_frame *volatile *tmp;
 531
 532     // The currently executing worker must own the worker lock to touch
 533     // victim->exc
 534     ASSERT_WORKER_LOCK_OWNED(victim);
 535
 536     tmp = victim->exc;
 537     if (tmp != EXC_INFINITY) {
 538         /* On most x86 this pair of operations would be slightly faster
 539            as an atomic exchange due to the implicit memory barrier in
 540            an atomic instruction. */
 541         victim->exc = tmp + 1;
 542         __cilkrts_fence();
 543     }
 544 }
 545
 546 static void decrement_E(__cilkrts_worker *victim)
 547 {
 548     __cilkrts_stack_frame *volatile *tmp;
 549
 550     // The currently executing worker must own the worker lock to touch
 551     // victim->exc
 552     ASSERT_WORKER_LOCK_OWNED(victim);
 553
 554     tmp = victim->exc;
 555     if (tmp != EXC_INFINITY) {
 556         /* On most x86 this pair of operations would be slightly faster
 557            as an atomic exchange due to the implicit memory barrier in
 558            an atomic instruction. */
 559         victim->exc = tmp - 1;
 560         __cilkrts_fence(); /* memory fence not really necessary */
 561     }
 562 }
 563
 564 #if 0
 565 /* for now unused, will be necessary if we implement abort */
 566 static void signal_THE_exception(__cilkrts_worker *wparent)
 567 {
 568     wparent->exc = EXC_INFINITY;
 569     __cilkrts_fence();
 570 }
 571 #endif
 572
 573 static void reset_THE_exception(__cilkrts_worker *w)
 574 {
 575     // The currently executing worker must own the worker lock to touch
 576     // w->exc
 577     ASSERT_WORKER_LOCK_OWNED(w);
 578
 579     w->exc = w->head;
 580     __cilkrts_fence();
 581 }
 582
 583 /* conditions under which victim->head can be stolen: */
 584 static int can_steal_from(__cilkrts_worker *victim)
 585 {
 586     return ((victim->head < victim->tail) &&
 587             (victim->head < victim->protected_tail));
 588 }
 589
 590 /* Return TRUE if the frame can be stolen, false otherwise */
 591 static int dekker_protocol(__cilkrts_worker *victim)
 592 {
 593     // increment_E and decrement_E are going to touch victim->exc.  The
 594     // currently executing worker must own victim's lock before they can
 595     // modify it
 596     ASSERT_WORKER_LOCK_OWNED(victim);
 597
 598     /* ASSERT(E >= H); */
 599
 600     increment_E(victim);
 601
 602     /* ASSERT(E >= H + 1); */
 603     if (can_steal_from(victim)) {
 604         /* success, we can steal victim->head and set H <- H + 1
 605            in detach() */
 606         return 1;
 607     } else {
 608         /* failure, restore previous state */
 609         decrement_E(victim);
 610         return 0;
 611     }
 612 }
 613
 614
 615 /* Link PARENT and CHILD in the spawn tree */
 616 static full_frame *make_child(__cilkrts_worker *w,
 617                               full_frame *parent_ff,
 618                               __cilkrts_stack_frame *child_sf,
 619                               cilk_fiber *fiber)
 620 {
 621     full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);
 622
 623     child_ff->parent = parent_ff;
 624     push_child(parent_ff, child_ff);
 625
 626     //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
 627     //    "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
 628     //    "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
 629     //          w->self, child, parent, child_sf,
 630     //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
 631     //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
 632     CILK_ASSERT(parent_ff->call_stack);
 633     child_ff->is_call_child = (fiber == NULL);
 634
 635     /* PLACEHOLDER_FIBER is used as non-null marker indicating that
 636        child should be treated as a spawn child even though we have not
 637        yet assigned a real fiber to its parent. */
 638     if (fiber == PLACEHOLDER_FIBER)
 639         fiber = NULL; /* Parent actually gets a null fiber, for now */
 640
 641     /* perform any system-dependent actions, such as capturing
 642        parameter passing information */
 643     /*__cilkrts_make_child_sysdep(child, parent);*/
 644
 645     /* Child gets reducer map and stack of parent.
 646        Parent gets a new map and new stack. */
 647     child_ff->fiber_self = parent_ff->fiber_self;
 648     child_ff->sync_master = NULL;
 649
 650     if (child_ff->is_call_child) {
 651         /* Cause segfault on any attempted access.  The parent gets
 652            the child map and stack when the child completes. */
 653         parent_ff->fiber_self = 0;
 654     } else {
 655         parent_ff->fiber_self = fiber;
 656     }
 657
 658     incjoin(parent_ff);
 659     return child_ff;
 660 }
 661
 662 static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
 663 {
 664     __cilkrts_stack_frame *p = sf->call_parent;
 665     sf->call_parent = 0;
 666     return p;
 667 }
 668
 669 /* w should be the currently executing worker.
 670  * loot_sf is the youngest stack frame in the call stack being
 671  *   unrolled (i.e., the most deeply nested stack frame.)
 672  *
 673  * When this method is called for a steal, loot_sf should be on a
 674  * victim worker which is different from w.
 675  * For CILK_FORCE_REDUCE, the victim worker will equal w.
 676  *
 677  * Before execution, the __cilkrts_stack_frame's have pointers from
 678  * older to younger, i.e., a __cilkrts_stack_frame points to parent.
 679  *
 680  * This method creates a full frame for each __cilkrts_stack_frame in
 681  * the call stack, with each full frame also pointing to its parent.
 682  *
 683  * The method returns the full frame created for loot_sf, i.e., the
 684  * youngest full frame.
 685  */
 686 static full_frame *unroll_call_stack(__cilkrts_worker *w,
 687                                      full_frame *ff,
 688                                      __cilkrts_stack_frame *const loot_sf)
 689 {
 690     __cilkrts_stack_frame *sf = loot_sf;
 691     __cilkrts_stack_frame *rev_sf = 0;
 692     __cilkrts_stack_frame *t_sf;
 693
 694     CILK_ASSERT(sf);
 695     /*CILK_ASSERT(sf->call_parent != sf);*/
 696
 697     /* The leafmost frame is unsynched. */
 698     if (sf->worker != w)
 699         sf->flags |= CILK_FRAME_UNSYNCHED;
 700
 701     /* Reverse the call stack to make a linked list ordered from parent
 702        to child.  sf->call_parent points to the child of SF instead of
 703        the parent.  */
 704     do {
 705         t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
 706         sf->call_parent = rev_sf;
 707         rev_sf = sf;
 708         sf = t_sf;
 709     } while (sf);
 710     sf = rev_sf;
 711
 712     /* Promote each stack frame to a full frame in order from parent
 713        to child, following the reversed list we just built. */
 714     make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
 715     /* T is the *child* of SF, because we have reversed the list */
 716     for (t_sf = __cilkrts_advance_frame(sf); t_sf;
 717          sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
 718         ff = make_child(w, ff, t_sf, NULL);
 719         make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
 720     }
 721
 722     /* XXX What if the leafmost frame does not contain a sync
 723        and this steal is from promote own deque? */
 724     /*sf->flags |= CILK_FRAME_UNSYNCHED;*/
 725
 726     CILK_ASSERT(!sf->call_parent);
 727     return ff;
 728 }
 729
 730 /* detach the top of the deque frame from the VICTIM and install a new
 731    CHILD frame in its place */
 732 static void detach_for_steal(__cilkrts_worker *w,
 733                              __cilkrts_worker *victim,
 734                              cilk_fiber* fiber)
 735 {
 736     /* ASSERT: we own victim->lock */
 737
 738     full_frame *parent_ff, *child_ff, *loot_ff;
 739     __cilkrts_stack_frame *volatile *h;
 740     __cilkrts_stack_frame *sf;
 741
 742     w->l->team = victim->l->team;
 743
 744     CILK_ASSERT(w->l->frame_ff == 0 || w == victim);
 745
 746     h = victim->head;
 747
 748     CILK_ASSERT(*h);
 749
 750     victim->head = h + 1;
 751
 752     parent_ff = victim->l->frame_ff;
 753     BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
 754         /* parent no longer referenced by victim */
 755         decjoin(parent_ff);
 756
 757         /* obtain the victim call stack */
 758         sf = *h;
 759
 760         /* perform system-dependent normalizations */
 761         /*__cilkrts_normalize_call_stack_on_steal(sf);*/
 762
 763         /* unroll PARENT_FF with call stack SF, adopt the youngest
 764            frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
 765            otherwise, loot_ff is newly created and we can modify it without
 766            holding its lock. */
 767         loot_ff = unroll_call_stack(w, parent_ff, sf);
 768
 769         #if REDPAR_DEBUG >= 3
 770         fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
 771                 w->self, victim->self,
 772                 parent_ff, loot_ff);
 773         #endif
 774
 775         if (WORKER_USER == victim->l->type &&
 776             NULL == victim->l->last_full_frame) {
 777             // Mark this looted frame as special: only the original user worker
 778             // may cross the sync.
 779             //
 780             // This call is a shared access to
 781             // victim->l->last_full_frame.
 782             set_sync_master(victim, loot_ff);
 783         }
 784
 785         /* LOOT is the next frame that the thief W is supposed to
 786            run, unless the thief is stealing from itself, in which
 787            case the thief W == VICTIM executes CHILD and nobody
 788            executes LOOT. */
 789         if (w == victim) {
 790             /* Pretend that frame has been stolen */
 791             loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
 792             loot_ff->simulated_stolen = 1;
 793         }
 794         else
 795             __cilkrts_push_next_frame(w, loot_ff);
 796
 797         // After this "push_next_frame" call, w now owns loot_ff.
 798         child_ff = make_child(w, loot_ff, 0, fiber);
 799
 800         BEGIN_WITH_FRAME_LOCK(w, child_ff) {
 801             /* install child in the victim's work queue, taking
 802                the parent_ff's place */
 803             /* child is referenced by victim */
 804             incjoin(child_ff);
 805
 806             // With this call, w is bestowing ownership of the newly
 807             // created frame child_ff to the victim, and victim is
 808             // giving up ownership of parent_ff.
 809             //
 810             // Worker w will either take ownership of parent_ff
 811             // if parent_ff == loot_ff, or parent_ff will be
 812             // suspended.
 813             //
 814             // Note that this call changes the victim->frame_ff
 815             // while the victim may be executing.
 816             make_runnable(victim, child_ff);
 817         } END_WITH_FRAME_LOCK(w, child_ff);
 818     } END_WITH_FRAME_LOCK(w, parent_ff);
 819 }
 820
 821 /**
 822  * @brief cilk_fiber_proc that resumes user code after a successful
 823  * random steal.
 824
 825  * This function longjmps back into the user code whose state is
 826  * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
 827  * is adjusted so that the code resumes on the specified fiber stack
 828  * instead of its original stack.
 829  *
 830  * This method gets executed only on a fiber freshly allocated from a
 831  * pool.
 832  *
 833  * @param fiber   The fiber being used to resume user code.
 834  * @param arg     Unused.
 835  */
 836 static
 837 void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
 838 {
 839     cilk_fiber_data *data = cilk_fiber_get_data(fiber);
 840     __cilkrts_stack_frame* sf = data->resume_sf;
 841     full_frame *ff;
 842
 843     CILK_ASSERT(sf);
 844
 845     // When we pull the resume_sf out of the fiber to resume it, clear
 846     // the old value.
 847     data->resume_sf = NULL;
 848     CILK_ASSERT(sf->worker == data->owner);
 849     ff = sf->worker->l->frame_ff;
 850
 851     // For Win32, we need to overwrite the default exception handler
 852     // in this function, so that when the OS exception handling code
 853     // walks off the top of the current Cilk stack, it reaches our stub
 854     // handler.
 855
 856     // Also, this function needs to be wrapped into a try-catch block
 857     // so the compiler generates the appropriate exception information
 858     // in this frame.
 859
 860     // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
 861     // this function (and does it matter?)
 862 #if defined(_WIN32) && !defined(_WIN64)
 863     install_exception_stub_handler();
 864     __try
 865 #endif
 866     {
 867         char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);
 868
 869         // Notify the Intel tools that we're stealing code
 870         ITT_SYNC_ACQUIRED(sf->worker);
 871         NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
 872
 873         // TBD: We'd like to move TBB-interop methods into the fiber
 874         // eventually.
 875         cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
 876
 877         sf->flags &= ~CILK_FRAME_SUSPENDED;
 878
 879         // longjmp to user code.  Don't process exceptions here,
 880         // because we are resuming a stolen frame.
 881         sysdep_longjmp_to_sf(new_sp, sf, NULL);
 882         /*NOTREACHED*/
 883         // Intel's C compiler respects the preceding lint pragma
 884     }
 885 #if defined(_WIN32) && !defined(_WIN64)
 886     __except (CILK_ASSERT(!"should not execute the the stub filter"),
 887               EXCEPTION_EXECUTE_HANDLER)
 888     {
 889         // If we are here, that means something very wrong
 890         // has happened in our exception processing...
 891         CILK_ASSERT(! "should not be here!");
 892     }
 893 #endif
 894 }
 895
 896 static void random_steal(__cilkrts_worker *w)
 897 {
 898     __cilkrts_worker *victim = NULL;
 899     cilk_fiber *fiber = NULL;
 900     int n;
 901     int success = 0;
 902     int32_t victim_id;
 903
 904     // Nothing's been stolen yet. When true, this will flag
 905     // setup_for_execution_pedigree to increment the pedigree
 906     w->l->work_stolen = 0;
 907
 908     /* If the user has disabled stealing (using the debugger) we fail */
 909     if (__builtin_expect(w->g->stealing_disabled, 0))
 910         return;
 911
 912     CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);
 913
 914     /* If there is only one processor work can still be stolen.
 915        There must be only one worker to prevent stealing. */
 916     CILK_ASSERT(w->g->total_workers > 1);
 917
 918     /* pick random *other* victim */
 919     n = myrand(w) % (w->g->total_workers - 1);
 920     if (n >= w->self)
 921         ++n;
 922
 923     // If we're replaying a log, override the victim.  -1 indicates that
 924     // we've exhausted the list of things this worker stole when we recorded
 925     // the log so just return.  If we're not replaying a log,
 926     // replay_get_next_recorded_victim() just returns the victim ID passed in.
 927     n = replay_get_next_recorded_victim(w, n);
 928     if (-1 == n)
 929         return;
 930
 931     victim = w->g->workers[n];
 932
 933     START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
 934         /* Verify that we can get a stack.  If not, no need to continue. */
 935         fiber = cilk_fiber_allocate(&w->l->fiber_pool);
 936     } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);
 937
 938
 939     if (NULL == fiber) {
 940 #if FIBER_DEBUG >= 2
 941         fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
 942                 w->self);
 943 #endif
 944         return;
 945     }
 946
 947     /* do not steal from self */
 948     CILK_ASSERT (victim != w);
 949
 950     /* Execute a quick check before engaging in the THE protocol.
 951        Avoid grabbing locks if there is nothing to steal. */
 952     if (!can_steal_from(victim)) {
 953         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
 954         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
 955             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
 956             // Fibers we use when trying to steal should not be active,
 957             // and thus should not have any other references.
 958             CILK_ASSERT(0 == ref_count);
 959         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
 960         return;
 961     }
 962
 963     /* Attempt to steal work from the victim */
 964     if (worker_trylock_other(w, victim)) {
 965         if (w->l->type == WORKER_USER && victim->l->team != w) {
 966
 967             // Fail to steal if this is a user worker and the victim is not
 968             // on this team.  If a user worker were allowed to steal work
 969             // descended from another user worker, the former might not be
 970             // done with its work by the time it was needed to resume and
 971             // unbind.  Therefore, user workers are not permitted to change
 972             // teams.
 973
 974             // There is no race on the victim's team because the victim cannot
 975             // change its team until it runs out of work to do, at which point
 976             // it will try to take out its own lock, and this worker already
 977             // holds it.
 978             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);
 979
 980         } else if (victim->l->frame_ff) {
 981             // A successful steal will change victim->frame_ff, even
 982             // though the victim may be executing.  Thus, the lock on
 983             // the victim's deque is also protecting victim->frame_ff.
 984             if (dekker_protocol(victim)) {
 985                 int proceed_with_steal = 1; // optimistic
 986
 987                 // If we're replaying a log, verify that this the correct frame
 988                 // to steal from the victim
 989                 if (! replay_match_victim_pedigree(w, victim))
 990                 {
 991                     // Abort the steal attempt. decrement_E(victim) to
 992                     // counter the increment_E(victim) done by the
 993                     // dekker protocol
 994                     decrement_E(victim);
 995                     proceed_with_steal = 0;
 996                 }
 997
 998                 if (proceed_with_steal)
 999                 {
1000                     START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
1001                         success = 1;
1002                         detach_for_steal(w, victim, fiber);
1003                         victim_id = victim->self;
1004
1005                         #if REDPAR_DEBUG >= 1
1006                         fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
1007                                 w->self, victim->self, fiber);
1008                         #endif
1009
1010                         // The use of victim->self contradicts our
1011                         // classification of the "self" field as
1012                         // local.  But since this code is only for
1013                         // debugging, it is ok.
1014                         DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
1015                             "            sf: %p, call parent: %p\n",
1016                             w->self, GetCurrentFiber(), victim->self,
1017                             w->l->next_frame_ff->call_stack,
1018                             w->l->next_frame_ff->call_stack->call_parent);
1019                     } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
1020                 }  // end if(proceed_with_steal)
1021             } else {
1022                 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
1023             }
1024         } else {
1025             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
1026         }
1027         worker_unlock_other(w, victim);
1028     } else {
1029         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
1030     }
1031
1032     // Record whether work was stolen.  When true, this will flag
1033     // setup_for_execution_pedigree to increment the pedigree
1034     w->l->work_stolen = success;
1035
1036     if (0 == success) {
1037         // failed to steal work.  Return the fiber to the pool.
1038         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
1039             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
1040             // Fibers we use when trying to steal should not be active,
1041             // and thus should not have any other references.
1042             CILK_ASSERT(0 == ref_count);
1043         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
1044     }
1045     else
1046     {
1047         // Since our steal was successful, finish initialization of
1048         // the fiber.
1049         cilk_fiber_reset_state(fiber,
1050                                fiber_proc_to_resume_user_code_for_random_steal);
1051         // Record the pedigree of the frame that w has stolen.
1052         // record only if CILK_RECORD_LOG is set
1053         replay_record_steal(w, victim_id);
1054     }
1055 }
1056
1057
1058
1059 /**
1060  * At a provably good steal, we need to transfer the child reducer map
1061  * from ff->children_reducer_map into v->reducer_map, where v is the
1062  * worker that resumes execution of ff.
1063  *
1064  * Normally, we have v == w, where w is the currently executing
1065  * worker.  In the case where we are resuming a team leader on a user
1066  * worker, however, v might differ from w.
1067
1068  * Thus, this, operation is a no-op, since we can't really move
1069  * ff->children_reducer_map into w here.
1070  *
1071  * Instead, this work is done in setup_for_execution_reducers().
1072  */
1073 static inline void provably_good_steal_reducers(__cilkrts_worker *w,
1074                                                 full_frame       *ff)
1075 {
1076     // No-op.
1077 }
1078
1079 /* at a provably good steal, incorporate the accumulated exceptions of
1080    children into the parent's exception */
1081 static void provably_good_steal_exceptions(__cilkrts_worker *w,
1082                                            full_frame       *ff)
1083 {
1084     // ASSERT: we own ff->lock
1085     ff->pending_exception =
1086         __cilkrts_merge_pending_exceptions(w,
1087                                            ff->child_pending_exception,
1088                                            ff->pending_exception);
1089     ff->child_pending_exception = NULL;
1090 }
1091
1092 /* At sync discard the frame's old stack and take the leftmost child's. */
1093 static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
1094 {
1095     CILK_ASSERT(NULL == ff->fiber_self);
1096     ff->fiber_self = ff->fiber_child;
1097     ff->fiber_child = NULL;
1098 }
1099
1100 static void __cilkrts_mark_synched(full_frame *ff)
1101 {
1102     ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
1103     ff->simulated_stolen = 0;
1104 }
1105
1106 static
1107 enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
1108                                                full_frame       *ff)
1109 {
1110     // ASSERT: we hold w->lock and ff->lock
1111
1112     enum provably_good_steal_t result = ABANDON_EXECUTION;
1113
1114     // If the current replay entry is a sync record matching the worker's
1115     // pedigree, AND this isn't the last child to the sync, return
1116     // WAIT_FOR_CONTINUE to indicate that the caller should loop until
1117     // we find the right frame to steal and CONTINUE_EXECUTION is returned.
1118     int match_found = replay_match_sync_pedigree(w);
1119     if (match_found && (0 != simulate_decjoin(ff)))
1120         return WAIT_FOR_CONTINUE;
1121
1122     START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
1123         if (decjoin(ff) == 0) {
1124             provably_good_steal_reducers(w, ff);
1125             provably_good_steal_exceptions(w, ff);
1126             provably_good_steal_stacks(w, ff);
1127             __cilkrts_mark_synched(ff);
1128
1129             // If the original owner wants this frame back (to resume
1130             // it on its original thread) pass it back now.
1131             if (NULL != ff->sync_master) {
1132                 // The frame wants to go back and be executed by the original
1133                 // user thread.  We can throw caution to the wind and push the
1134                 // frame straight onto its queue because the only way we have
1135                 // gotten to this point of being able to continue execution of
1136                 // the frame is if the original user worker is spinning without
1137                 // work.
1138
1139                 unset_sync_master(w->l->team, ff);
1140                 __cilkrts_push_next_frame(w->l->team, ff);
1141
1142                 // If this is the team leader we're not abandoning the work
1143                 if (w == w->l->team)
1144                     result = CONTINUE_EXECUTION;
1145             } else {
1146                 __cilkrts_push_next_frame(w, ff);
1147                 result = CONTINUE_EXECUTION;  // Continue working on this thread
1148             }
1149
1150             // The __cilkrts_push_next_frame() call changes ownership
1151             // of ff to the specified worker.
1152         }
1153     } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);
1154
1155     // Only write a SYNC record if:
1156     // - We're recording a log *AND*
1157     // - We're the worker continuing from this sync
1158     replay_record_sync(w, result == CONTINUE_EXECUTION);
1159
1160     // If we're replaying a log, and matched a sync from the log, mark the
1161     // sync record seen if the sync isn't going to be abandoned.
1162     replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);
1163
1164     return result;
1165 }
1166
1167 static void unconditional_steal(__cilkrts_worker *w,
1168                                 full_frame *ff)
1169 {
1170     // ASSERT: we hold ff->lock
1171
1172     START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
1173         decjoin(ff);
1174         __cilkrts_push_next_frame(w, ff);
1175     } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
1176 }
1177
1178
1179 /* CHILD is about to die.  Give its exceptions to a sibling or to the
1180    parent.  */
1181 static inline void splice_exceptions_for_call(__cilkrts_worker *w,
1182                                               full_frame *parent_ff,
1183                                               full_frame *child_ff)
1184 {
1185     // ASSERT: We own parent_ff->lock
1186     CILK_ASSERT(child_ff->is_call_child);
1187     CILK_ASSERT(NULL == child_ff->right_pending_exception);
1188     CILK_ASSERT(NULL == parent_ff->pending_exception);
1189
1190     parent_ff->pending_exception = child_ff->pending_exception;
1191     child_ff->pending_exception = NULL;
1192 }
1193
1194 /**
1195  * Merge exceptions for a dying child.
1196  *
1197  * @param w                   The currently executing worker.
1198  * @param ff                  The child frame that is dying.
1199  * @param left_exception_ptr  Pointer to the exception that is to our left.
1200  */
1201 static inline
1202 void splice_exceptions_for_spawn(__cilkrts_worker *w,
1203                                  full_frame *ff,
1204                                  struct pending_exception_info **left_exception_ptr)
1205 {
1206     // ASSERT: parent_ff == child_ff->parent.
1207     // ASSERT: We own parent_ff->lock
1208
1209     // Merge current exception into the slot where the left
1210     // exception should go.
1211     *left_exception_ptr =
1212         __cilkrts_merge_pending_exceptions(w,
1213                                            *left_exception_ptr,
1214                                            ff->pending_exception);
1215     ff->pending_exception = NULL;
1216
1217
1218     // Merge right exception into the slot where the left exception
1219     // should go.
1220     *left_exception_ptr =
1221         __cilkrts_merge_pending_exceptions(w,
1222                                            *left_exception_ptr,
1223                                            ff->right_pending_exception);
1224     ff->right_pending_exception = NULL;
1225 }
1226
1227
1228 static inline void splice_stacks_for_call(__cilkrts_worker *w,
1229                                           full_frame *parent_ff,
1230                                           full_frame *child_ff)
1231 {
1232 #if CILK_LIB_DEBUG
1233     if (parent_ff->call_stack)
1234         CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
1235 #endif
1236
1237     /* A synched frame does not have accumulated child reducers. */
1238     CILK_ASSERT(!child_ff->fiber_child);
1239     CILK_ASSERT(child_ff->is_call_child);
1240
1241     /* An attached parent has no self fiber.  It may have
1242        accumulated child fibers or child owners, which should be
1243        ignored until sync. */
1244     CILK_ASSERT(!parent_ff->fiber_self);
1245     parent_ff->fiber_self = child_ff->fiber_self;
1246     child_ff->fiber_self = NULL;
1247 }
1248
1249 static void finalize_child_for_call(__cilkrts_worker *w,
1250                                     full_frame *parent_ff,
1251                                     full_frame *child_ff)
1252 {
1253     // ASSERT: we hold w->lock and parent_ff->lock
1254
1255     START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
1256         CILK_ASSERT(child_ff->is_call_child);
1257         CILK_ASSERT(child_ff->join_counter == 0);
1258         CILK_ASSERT(!child_ff->rightmost_child);
1259         CILK_ASSERT(child_ff == parent_ff->rightmost_child);
1260
1261         // CHILD is about to die.
1262         // Splicing out reducers is a no-op for a call since
1263         // w->reducer_map should already store the correct
1264         // reducer map.
1265
1266         // ASSERT there are no maps left to reduce.
1267         CILK_ASSERT(NULL == child_ff->children_reducer_map);
1268         CILK_ASSERT(NULL == child_ff->right_reducer_map);
1269
1270         splice_exceptions_for_call(w, parent_ff, child_ff);
1271
1272         splice_stacks_for_call(w, parent_ff, child_ff);
1273
1274         /* remove CHILD from list of children of PARENT */
1275         unlink_child(parent_ff, child_ff);
1276
1277         /* continue with the parent. */
1278         unconditional_steal(w, parent_ff);
1279         __cilkrts_destroy_full_frame(w, child_ff);
1280     } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
1281 }
1282
1283
1284 /**
1285  * The invariant on ff->children_reducer_map is that when ff is
1286  * synched and when we are about to resume execution of ff, at least
1287  * one of ff->children_reducer_map and w->reducer_map must be NULL.
1288  *
1289  * Consider the two possibilities before resuming execution of ff:
1290  *
1291  * 1.  Suppose ff is synched and suspended.  Then either
1292  *
1293  *     (a) ff->children_reducer_map stores the reducer map that w
1294  *         should use, where w is the worker resuming execution of ff,
1295  *         OR
1296  *     (b) w already has a user map, and ff->children_reducer_map is NULL.
1297  *
1298  *     Case (a) happens when we are resuming execution of ff as a
1299  *     provably good steal.  In this case, w->reducer_map should be
1300  *     NULL and ff->children_reducer_map is valid.  To resume
1301  *     execution of ff on w, set w->reducer_map to
1302  *     ff->children_reducer_map.
1303  *
1304  *     Case (b) occurs when we resume execution of ff because ff is a
1305  *     called child.  Then, ff->children_reducer_map should be NULL,
1306  *     and w should already have a valid reducer map when resuming
1307  *     execution of ff.  We resume execution of ff without changing
1308  *     w->reducer_map.
1309  *
1310  * 2. Suppose frame ff is not synched (i.e., it is active and might have
1311  *    active children).   Then ff->children_reducer_map is the slot for
1312  *    storing the reducer map from ff's leftmost child, as in the reducer
1313  *    protocol.   The runtime may resume execution of ff while it is not
1314  *    synched only because of a steal.
1315  *    In this case, while we are resuming ff, ff->children_reducer_map
1316  *    may be non-NULL (because one of ff's children has completed).
1317  *    We resume execution of ff without changing w->reducer_map.
1318  */
1319 static void setup_for_execution_reducers(__cilkrts_worker *w,
1320                                          full_frame *ff)
1321 {
1322     // We only need to move ff->children_reducer_map into
1323     // w->reducer_map in case 1(a).
1324     //
1325     // First check whether ff is synched.
1326     __cilkrts_stack_frame *sf = ff->call_stack;
1327     if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
1328         // In this case, ff is synched. (Case 1).
1329         CILK_ASSERT(!ff->rightmost_child);
1330
1331         // Test whether we are in case 1(a) and have
1332         // something to do.  Note that if both
1333         // ff->children_reducer_map and w->reducer_map are NULL, we
1334         // can't distinguish between cases 1(a) and 1(b) here.
1335         if (ff->children_reducer_map) {
1336             // We are in Case 1(a).
1337             CILK_ASSERT(!w->reducer_map);
1338             w->reducer_map = ff->children_reducer_map;
1339             ff->children_reducer_map = NULL;
1340         }
1341     }
1342 }
1343
1344 static void setup_for_execution_exceptions(__cilkrts_worker *w,
1345                                            full_frame *ff)
1346 {
1347     CILK_ASSERT(NULL == w->l->pending_exception);
1348     w->l->pending_exception = ff->pending_exception;
1349     ff->pending_exception = NULL;
1350 }
1351
1352 #if 0 /* unused */
1353 static void setup_for_execution_stack(__cilkrts_worker *w,
1354                                       full_frame *ff)
1355 {
1356 }
1357 #endif
1358
1359 /*
1360  * setup_for_execution_pedigree
1361  *
1362  * Copies the pedigree information from the frame we're resuming to the
1363  * worker.  Increments the pedigree if this is work that has been stolen
1364  * to match the increment on a return from a spawn helper.
1365  */
1366 static void setup_for_execution_pedigree(__cilkrts_worker *w)
1367 {
1368     int pedigree_unsynched;
1369     __cilkrts_stack_frame *sf = w->current_stack_frame;
1370
1371     CILK_ASSERT(NULL != sf);
1372
1373     // If this isn't an ABI 1 or later frame, there's no pedigree information
1374     if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
1375         return;
1376
1377     // Note whether the pedigree is unsynched and clear the flag before
1378     // we forget
1379     pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1380     sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1381
1382     // If we're just marshalling onto this worker, do not increment
1383     // the rank since that wouldn't happen in a sequential execution
1384     if (w->l->work_stolen || pedigree_unsynched)
1385     {
1386         if (w->l->work_stolen)
1387             w->pedigree.rank = sf->parent_pedigree.rank + 1;
1388         else
1389             w->pedigree.rank = sf->parent_pedigree.rank;
1390     }
1391
1392     w->pedigree.parent = sf->parent_pedigree.parent;
1393     w->l->work_stolen = 0;
1394 }
1395
1396 static void setup_for_execution(__cilkrts_worker *w,
1397                                 full_frame *ff,
1398                                 int is_return_from_call)
1399 {
1400     // ASSERT: We own w->lock and ff->lock || P == 1
1401
1402     setup_for_execution_reducers(w, ff);
1403     setup_for_execution_exceptions(w, ff);
1404     /*setup_for_execution_stack(w, ff);*/
1405
1406     ff->call_stack->worker = w;
1407     w->current_stack_frame = ff->call_stack;
1408
1409     // If this is a return from a call, leave the pedigree alone
1410     if (! is_return_from_call)
1411         setup_for_execution_pedigree(w);
1412
1413     __cilkrts_setup_for_execution_sysdep(w, ff);
1414
1415     w->head = w->tail = w->l->ltq;
1416     reset_THE_exception(w);
1417
1418     make_runnable(w, ff);
1419 }
1420
1421
1422 /*
1423  * Called by the scheduling fiber, right before
1424  * resuming a sf/ff for user code.
1425  *
1426  * This method associates the specified sf with the worker.
1427  *
1428  * It also asserts that w, ff, and sf all have the expected properties
1429  * for resuming user code.
1430  */
1431 void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
1432                                                   full_frame *ff,
1433                                                   __cilkrts_stack_frame *sf)
1434 {
1435     w->current_stack_frame = sf;
1436     sf->worker = w;
1437
1438     // Lots of debugging checks on the state of the fiber we might be
1439     // resuming.
1440 #if FIBER_DEBUG >= 1
1441 #   if FIBER_DEBUG >= 3
1442     {
1443         fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
1444                 w->self, ff, sf);
1445     }
1446 #   endif
1447
1448     const int flags = sf->flags;
1449     CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
1450     CILK_ASSERT(!sf->call_parent);
1451     CILK_ASSERT(w->head == w->tail);
1452
1453     /* A frame can not be resumed unless it was suspended. */
1454     CILK_ASSERT(ff->sync_sp != NULL);
1455
1456     /* The leftmost frame has no allocated stack */
1457     if (ff->simulated_stolen)
1458         CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
1459     else if (flags & CILK_FRAME_UNSYNCHED)
1460         /* XXX By coincidence sync_sp could be null. */
1461         CILK_ASSERT(ff->fiber_self != NULL);
1462     else
1463         /* XXX This frame could be resumed unsynched on the leftmost stack */
1464         CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
1465     CILK_ASSERT(w->l->frame_ff == ff);
1466 #endif
1467 }
1468
1469
1470 /**
1471  * This method is the first method that should execute after we've
1472  * switched to a scheduling fiber from user code.
1473  *
1474  * @param fiber The scheduling fiber for the current worker.
1475  * @param wptr  The current worker.
1476  */
1477 static void enter_runtime_transition_proc(cilk_fiber *fiber)
1478 {
1479     // We can execute this method for one of three reasons:
1480     // 1. Undo-detach finds parent stolen.
1481     // 2. Sync suspends frame.
1482     // 3. Return from Cilk entry point.
1483     //
1484     //
1485     // In cases 1 and 2, the frame may be truly suspended or
1486     // may be immediately executed by this worker after provably_good_steal.
1487     //
1488     //
1489     // There is a fourth case, which can, but does not need to execute
1490     // this function:
1491     //   4. Starting up the scheduling loop on a user or
1492     //      system worker.  In this case, we won't have
1493     //      a scheduling stack function to run.
1494     __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1495     if (w->l->post_suspend) {
1496         // Run the continuation function passed to longjmp_into_runtime
1497         run_scheduling_stack_fcn(w);
1498
1499         // After we have jumped into the runtime and run the
1500         // scheduling function, any reducer map the worker had before entering the runtime
1501         // should have already been saved into the appropriate full
1502         // frame.
1503         CILK_ASSERT(NULL == w->reducer_map);
1504
1505         // There shouldn't be any uncaught exceptions.
1506         //
1507         // In Windows, the OS catches any exceptions not caught by the
1508         // user code.  Thus, we are omitting the check on Windows.
1509         //
1510         // On Android, calling std::uncaught_exception with the stlport
1511         // library causes a seg fault.  Since we're not supporting
1512         // exceptions there at this point, just don't do the check
1513         //
1514         // TBD: Is this check also safe to do on Windows?
1515         CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1516     }
1517 }
1518
1519
1520 /**
1521  * Method called to jump back to executing user code.
1522  *
1523  * A normal return from the runtime back to resuming user code calls
1524  * this method.  A computation executed using force_reduce also calls
1525  * this method to return to user code.
1526  *
1527  * This function should not contain any code that depends on a fiber.
1528  * In a force-reduce case, the user worker may not have a fiber.  In
1529  * the force-reduce case, we call this method directly instead of
1530  * calling @c user_code_resume_after_switch_into_runtime.
1531  */
1532 static inline NORETURN
1533 cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
1534 {
1535     // Save the sync stack pointer, and do the bookkeeping
1536     char* sync_sp = ff->sync_sp;
1537     __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null
1538
1539     sf->flags &= ~CILK_FRAME_SUSPENDED;
1540     // Actually longjmp to the user code.
1541     // We may have exceptions to deal with, since we are resuming
1542     // a previous-suspended frame.
1543     sysdep_longjmp_to_sf(sync_sp, sf, ff);
1544 }
1545
1546
1547 /**
1548  * Called by the user-code fiber right before resuming a full frame
1549  * (sf/ff).
1550  *
1551  * This method pulls sf/ff out of the worker, and then calls
1552  * cilkrts_resume to jump to user code.
1553  */
1554 static NORETURN
1555 user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
1556 {
1557     __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
1558     __cilkrts_stack_frame *sf;
1559     full_frame *ff;
1560     sf = w->current_stack_frame;
1561     ff = sf->worker->l->frame_ff;
1562
1563 #if FIBER_DEBUG >= 1
1564     CILK_ASSERT(ff->fiber_self == fiber);
1565     cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
1566     DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
1567                w->self, w, fiber);
1568     CILK_ASSERT(sf == fdata->resume_sf);
1569 #endif
1570
1571     // Notify the Intel tools that we're stealing code
1572     ITT_SYNC_ACQUIRED(sf->worker);
1573     NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
1574     cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
1575
1576     // Actually jump to user code.
1577     cilkrts_resume(sf, ff);
1578  }
1579
1580
1581 /* The current stack is about to either be suspended or destroyed.  This
1582  * function will switch to the stack on which the scheduler is suspended and
1583  * resume running the scheduler within function do_work().  Upon waking up,
1584  * the scheduler will run the 'cont' function, using the supplied worker and
1585  * frame.
1586  */
1587 static NORETURN
1588 longjmp_into_runtime(__cilkrts_worker *w,
1589                      scheduling_stack_fcn_t fcn,
1590                      __cilkrts_stack_frame *sf)
1591 {
1592     full_frame *ff, *ff2;
1593
1594     CILK_ASSERT(!w->l->post_suspend);
1595     ff = w->l->frame_ff;
1596
1597     // If we've got only one worker, stealing shouldn't be possible.
1598     // Assume that this is a steal or return from spawn in a force-reduce case.
1599     // We don't have a scheduling stack to switch to, so call the continuation
1600     // function directly.
1601     if (1 == w->g->P) {
1602         fcn(w, ff, sf);
1603
1604         /* The call to function c() will have pushed ff as the next frame.  If
1605          * this were a normal (non-forced-reduce) execution, there would have
1606          * been a pop_next_frame call in a separate part of the runtime.  We
1607          * must call pop_next_frame here to complete the push/pop cycle. */
1608         ff2 = pop_next_frame(w);
1609
1610         setup_for_execution(w, ff2, 0);
1611         scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
1612         cilkrts_resume(w->current_stack_frame, ff2);
1613
1614 // Suppress clang warning that the expression result is unused
1615 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1616 #   pragma clang diagnostic push
1617 #   pragma clang diagnostic ignored "-Wunused-value"
1618 #endif // __clang__
1619         /* no return */
1620         CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
1621 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1622 #   pragma clang diagnostic pop
1623 #endif // __clang__
1624     }
1625
1626     w->l->post_suspend = fcn;
1627     w->l->suspended_stack = sf;
1628
1629     ITT_SYNC_RELEASING(w);
1630     ITT_SYNC_PREPARE(w);
1631
1632 #if FIBER_DEBUG >= 2
1633     fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
1634             cilkos_get_current_thread_id(),
1635             w->self, w->l->frame_ff,
1636             sf);
1637 #endif
1638
1639     // Current fiber is either the (1) one we are about to free,
1640     // or (2) it has been passed up to the parent.
1641     cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
1642                                   w->l->fiber_to_free :
1643                                   w->l->frame_ff->parent->fiber_child );
1644     cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
1645     CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);
1646
1647     // Clear the sf in the current fiber for cleanliness, to prevent
1648     // us from accidentally resuming a bad sf.
1649     // Technically, resume_sf gets overwritten for a fiber when
1650     // we are about to resume it anyway.
1651     fdata->resume_sf = NULL;
1652     CILK_ASSERT(fdata->owner == w);
1653
1654     // Set the function to execute immediately after switching to the
1655     // scheduling fiber, but before freeing any fibers.
1656     cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
1657                                     enter_runtime_transition_proc);
1658     cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);
1659
1660     if (w->l->fiber_to_free) {
1661         // Case 1: we are freeing this fiber.  We never
1662         // resume this fiber again after jumping into the runtime.
1663         w->l->fiber_to_free = NULL;
1664
1665         // Extra check. Normally, the fiber we are about to switch to
1666         // should have a NULL owner.
1667         CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
1668 #if FIBER_DEBUG >= 4
1669         fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1670                 cilkos_get_current_thread_id(),
1671                 w->self,
1672                 current_fiber, w->l->scheduling_fiber);
1673 #endif
1674         cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
1675         NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
1676         cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
1677                                                                &w->l->fiber_pool,
1678                                                                w->l->scheduling_fiber);
1679         // We should never come back here!
1680         CILK_ASSERT(0);
1681     }
1682     else {
1683         // Case 2: We are passing the fiber to our parent because we
1684         // are leftmost.  We should come back later to
1685         // resume execution of user code.
1686         //
1687         // If we are not freeing a fiber, there we must be
1688         // returning from a spawn or processing an exception.  The
1689         // "sync" path always frees a fiber.
1690         //
1691         // We must be the leftmost child, and by left holder logic, we
1692         // have already moved the current fiber into our parent full
1693         // frame.
1694 #if FIBER_DEBUG >= 2
1695         fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1696                 cilkos_get_current_thread_id(),
1697                 w->self,
1698                 current_fiber, w->l->scheduling_fiber);
1699 #endif
1700
1701         NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
1702
1703         cilk_fiber_suspend_self_and_resume_other(current_fiber,
1704                                                  w->l->scheduling_fiber);
1705         // Resuming this fiber returns control back to
1706         // this function because our implementation uses OS fibers.
1707         //
1708         // On Unix, we could have the choice of passing the
1709         // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
1710         // that resumes execution of user code instead of the
1711         // jumping back here, and then jumping back to user code.
1712 #if FIBER_DEBUG >= 2
1713         CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
1714 #endif
1715         user_code_resume_after_switch_into_runtime(current_fiber);
1716     }
1717 }
1718
1719 /*
1720  * Send a message to the children of the specified worker: run or wait.
1721  */
1722 static void notify_children(__cilkrts_worker *w, unsigned int msg)
1723 {
1724     int child_num;
1725     __cilkrts_worker *child;
1726     int num_sys_workers = w->g->P - 1;
1727
1728     // If worker is "n", then its children are 2n + 1, and 2n + 2.
1729     child_num = (w->self << 1) + 1;
1730     if (child_num < num_sys_workers) {
1731         child = w->g->workers[child_num];
1732         CILK_ASSERT(child->l->signal_node);
1733         signal_node_msg(child->l->signal_node, msg);
1734         child_num++;
1735         if (child_num < num_sys_workers) {
1736             child = w->g->workers[child_num];
1737             CILK_ASSERT(child->l->signal_node);
1738             signal_node_msg(child->l->signal_node, msg);
1739         }
1740     }
1741 }
1742
1743 /*
1744  * Notify this worker's children that they need to wait.
1745  */
1746 static void notify_children_wait(__cilkrts_worker *w)
1747 {
1748     notify_children(w, 0);
1749 }
1750
1751 /*
1752  * Notify this worker's children to run and start trying to steal.
1753  */
1754 static void notify_children_run(__cilkrts_worker *w)
1755 {
1756     notify_children(w, 1);
1757 }
1758
1759 /**
1760  * A single "check" to find work, either on our queue or through a
1761  * steal attempt.  This method checks our local queue once, and
1762  * performs one steal attempt.
1763  */
1764 static full_frame* check_for_work(__cilkrts_worker *w)
1765 {
1766     full_frame *ff = NULL;
1767     ff = pop_next_frame(w);
1768     // If there is no work on the queue, try to steal some.
1769     if (NULL == ff) {
1770         START_INTERVAL(w, INTERVAL_STEALING) {
1771             if (w->l->type != WORKER_USER && w->l->team != NULL) {
1772                 // At this point, the worker knows for certain that it has run
1773                 // out of work.  Therefore, it loses its team affiliation.  User
1774                 // workers never change teams, of course.
1775                 __cilkrts_worker_lock(w);
1776                 w->l->team = NULL;
1777                 __cilkrts_worker_unlock(w);
1778             }
1779
1780             // If we are about to do a random steal, we should have no
1781             // full frame...
1782             CILK_ASSERT(NULL == w->l->frame_ff);
1783             random_steal(w);
1784         } STOP_INTERVAL(w, INTERVAL_STEALING);
1785
1786         // If the steal was successful, then the worker has populated its next
1787         // frame with the work to resume.
1788         ff = pop_next_frame(w);
1789         if (NULL == ff) {
1790             // Punish the worker for failing to steal.
1791             // No quantum for you!
1792             if (w->l->steal_failure_count > 30000) {
1793                 // Punish more if the worker has been doing unsuccessful steals
1794                 // for a long time. After return from the idle state, it will
1795                 // be given a grace period to react quickly.
1796                 __cilkrts_idle();
1797                 w->l->steal_failure_count -= 300;
1798             } else {
1799                 __cilkrts_yield();
1800             }
1801             w->l->steal_failure_count++;
1802         } else {
1803             // Reset steal_failure_count since there is obviously still work to
1804             // be done.
1805             w->l->steal_failure_count = 0;
1806         }
1807     }
1808     return ff;
1809 }
1810
1811 /**
1812  * Keep stealing or looking on our queue.
1813  *
1814  * Returns either when a full frame is found, or NULL if the
1815  * computation is done.
1816  */
1817 static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
1818 {
1819     full_frame *ff = NULL;
1820     // Find a full frame to execute (either through random stealing,
1821     // or because we pull it off w's 1-element queue).
1822     while (!ff) {
1823         // Check worker state to figure out our next action.
1824         switch (worker_runnable(w))
1825         {
1826         case SCHEDULE_RUN:             // One attempt at checking for work.
1827             ff = check_for_work(w);
1828             break;
1829         case SCHEDULE_WAIT:            // go into wait-mode.
1830             START_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1831             CILK_ASSERT(WORKER_SYSTEM == w->l->type);
1832             // If we are about to wait, then we better not have
1833             // a frame that we should execute...
1834             CILK_ASSERT(NULL == w->l->next_frame_ff);
1835             notify_children_wait(w);
1836             signal_node_wait(w->l->signal_node);
1837             // ...
1838             // Runtime is waking up.
1839             notify_children_run(w);
1840             w->l->steal_failure_count = 0;
1841             STOP_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1842             break;
1843         case SCHEDULE_EXIT:            // exit the scheduler.
1844             CILK_ASSERT(WORKER_USER != w->l->type);
1845             return NULL;
1846         default:
1847             CILK_ASSERT(0);
1848             abort();
1849         }
1850     }
1851     return ff;
1852 }
1853
1854 /**
1855  * The proc method for a scheduling fiber on a user worker.
1856  *
1857  * When a user worker jumps into the runtime, it jumps into this
1858  * method by either starting it if the scheduling fiber has never run
1859  * before, or resuming the fiber if it was previously suspended.
1860  */
1861 COMMON_PORTABLE
1862 void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
1863 {
1864     __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1865     CILK_ASSERT(w);
1866
1867     // This must be a user worker
1868     CILK_ASSERT(WORKER_USER == w->l->type);
1869
1870     // If we aren't the current worker, then something is very wrong
1871     // here..
1872     verify_current_wkr(w);
1873
1874     __cilkrts_run_scheduler_with_exceptions(w);
1875 }
1876
1877
1878 /**
1879  * The body of the runtime scheduling loop.  This function executes in
1880  * 4 stages:
1881  *
1882  * 1. Transitions from the user code into the runtime by
1883  *    executing any scheduling-stack functions.
1884  * 2. Looks for a full frame enqueued from a successful provably
1885  *    good steal.
1886  * 3. If no full frame is found in step 2, steal until
1887  *    a frame is found or we are done.  If we are done, finish
1888  *    the scheduling loop.
1889  * 4. When a frame is found, setup to resume user code.
1890  *    In particular, suspend the current fiber and resume the
1891  *    user fiber to execute the frame.
1892  *
1893  * Returns a fiber object that we should switch to after completing
1894  * the body of the loop, or NULL if we should continue executing on
1895  * this fiber.
1896  *
1897  * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
1898  *
1899  * @param current_fiber   The currently executing (scheduling_ fiber
1900  * @param wptr            The currently executing worker.
1901  * @param return          The next fiber we should switch to.
1902  */
1903 static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
1904                                                void* wptr)
1905 {
1906     __cilkrts_worker *w = (__cilkrts_worker*) wptr;
1907     CILK_ASSERT(current_fiber == w->l->scheduling_fiber);
1908
1909     // Stage 1: Transition from executing user code to the runtime code.
1910     // We don't need to do this call here any more, because
1911     // every switch to the scheduling fiber should make this call
1912     // using a post_switch_proc on the fiber.
1913     //
1914     //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);
1915
1916     // After Stage 1 is complete, w should no longer have
1917     // an associated full frame.
1918     CILK_ASSERT(NULL == w->l->frame_ff);
1919
1920     // Stage 2.  First do a quick check of our 1-element queue.
1921     full_frame *ff = pop_next_frame(w);
1922
1923     if (!ff) {
1924         // Stage 3.  We didn't find anything from our 1-element
1925         // queue.  Now go through the steal loop to find work.
1926         ff = search_until_work_found_or_done(w);
1927         if (!ff) {
1928             CILK_ASSERT(w->g->work_done);
1929             return NULL;
1930         }
1931     }
1932
1933     // Stage 4.  Now that we have found a full frame to work on,
1934     // actually execute it.
1935     __cilkrts_stack_frame *sf;
1936
1937     // There shouldn't be any uncaught exceptions.
1938     //
1939     // In Windows, the OS catches any exceptions not caught by the
1940     // user code.  Thus, we are omitting the check on Windows.
1941     //
1942     // On Android, calling std::uncaught_exception with the stlport
1943     // library causes a seg fault.  Since we're not supporting
1944     // exceptions there at this point, just don't do the check
1945     CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1946
1947     BEGIN_WITH_WORKER_LOCK(w) {
1948         CILK_ASSERT(!w->l->frame_ff);
1949         BEGIN_WITH_FRAME_LOCK(w, ff) {
1950             sf = ff->call_stack;
1951             CILK_ASSERT(sf && !sf->call_parent);
1952             setup_for_execution(w, ff, 0);
1953         } END_WITH_FRAME_LOCK(w, ff);
1954     } END_WITH_WORKER_LOCK(w);
1955
1956     /* run it */
1957     //
1958     // Prepare to run the full frame.  To do so, we need to:
1959     //   (a) Execute some code on this fiber (the scheduling
1960     //       fiber) to set up data structures, and
1961     //   (b) Suspend the scheduling fiber, and resume the
1962     //       user-code fiber.
1963
1964     // Part (a). Set up data structures.
1965     scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);
1966
1967     cilk_fiber *other = w->l->frame_ff->fiber_self;
1968     cilk_fiber_data* other_data = cilk_fiber_get_data(other);
1969     cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);
1970
1971     // I believe two cases are possible here, both of which
1972     // should have other_data->resume_sf as NULL.
1973     //
1974     // 1. Resuming a fiber that was previously executing
1975     //    user code (i.e., a provably-good-steal).
1976     //    In this case, resume_sf should have been
1977     //    set to NULL when it was suspended.
1978     //
1979     // 2. Resuming code on a steal.  In this case, since we
1980     //    grabbed a new fiber, resume_sf should be NULL.
1981     CILK_ASSERT(NULL == other_data->resume_sf);
1982
1983 #if FIBER_DEBUG >= 2
1984     fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
1985             w->self, other, other_data->resume_sf);
1986 #endif
1987     // Update our own fiber's data.
1988     current_fiber_data->resume_sf = NULL;
1989     // The scheduling fiber should have the right owner from before.
1990     CILK_ASSERT(current_fiber_data->owner == w);
1991     other_data->resume_sf = sf;
1992
1993
1994 #if FIBER_DEBUG >= 3
1995     fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
1996             cilkos_get_current_thread_id(),
1997             w->self,
1998             current_fiber, other,
1999             current_fiber_data->resume_sf,
2000             other_data->resume_sf);
2001 #endif
2002     return other;
2003 }
2004
2005
2006 /**
2007  * This function is executed once by each worker, to initialize its
2008  * scheduling loop.
2009  */
2010 static void worker_scheduler_init_function(__cilkrts_worker *w)
2011 {
2012     // First, execute the startup tasks that must happen for all
2013     // worker types.
2014     ITT_SYNC_PREPARE(w);
2015     /* Notify tools about the new worker. Inspector needs this, but we
2016        don't want to confuse Cilkscreen with system threads.  User threads
2017        do this notification in bind_thread */
2018     if (! w->g->under_ptool)
2019         __cilkrts_cilkscreen_establish_worker(w);
2020
2021     // Seed the initial random number generator.
2022     // If we forget to do this, then the worker always steals from 0.
2023     // Programs will still execute correctly, but
2024     // you may see a subtle performance bug...
2025     mysrand(w, (w->self + 1));
2026
2027     // The startup work varies, depending on the worker type.
2028     switch (w->l->type) {
2029     case WORKER_USER:
2030         break;
2031
2032     case WORKER_SYSTEM:
2033         // If a system worker is starting, we must also be starting
2034         // the runtime.
2035
2036         // Runtime begins in a wait-state and is woken up by the first user
2037         // worker when the runtime is ready.
2038         signal_node_wait(w->l->signal_node);
2039         // ...
2040         // Runtime is waking up.
2041         notify_children_run(w);
2042         w->l->steal_failure_count = 0;
2043         break;
2044     default:
2045         __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
2046                       w, w->l->type);
2047     }
2048 }
2049
2050 /**
2051  * This function is executed once by each worker, to finish its
2052  * scheduling loop.
2053  *
2054  * @note Currently, only system workers finish their loops.  User
2055  * workers will jump away to user code without exiting their
2056  * scheduling loop.
2057  */
2058 static void worker_scheduler_terminate_function(__cilkrts_worker *w)
2059 {
2060     // A user worker should never finish by falling through the
2061     // scheduling loop.
2062     CILK_ASSERT(WORKER_USER != w->l->type);
2063 }
2064
2065 /**
2066  * The main scheduler function executed by a worker's scheduling
2067  * fiber.
2068  *
2069  * This method is started by either a new system worker, or a user
2070  * worker that has stalled and just been imported into the runtime.
2071  */
2072 static void worker_scheduler_function(__cilkrts_worker *w)
2073 {
2074     START_INTERVAL(w, INTERVAL_INIT_WORKER);
2075     worker_scheduler_init_function(w);
2076     STOP_INTERVAL(w, INTERVAL_INIT_WORKER);
2077
2078     // The main scheduling loop body.
2079
2080     while (!w->g->work_done) {
2081         // Execute the "body" of the scheduling loop, and figure
2082         // out the fiber to jump to next.
2083         START_INTERVAL(w, INTERVAL_SCHED_LOOP);
2084         cilk_fiber* fiber_to_resume
2085             = worker_scheduling_loop_body(w->l->scheduling_fiber, w);
2086         STOP_INTERVAL(w, INTERVAL_SCHED_LOOP);
2087
2088         if (fiber_to_resume) {
2089             // Suspend the current fiber and resume next one.
2090             NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
2091
2092             // Whenever we jump to resume user code, we stop being in
2093             // the runtime, and start working.
2094             STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2095             START_INTERVAL(w, INTERVAL_WORKING);
2096             cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
2097                                                      fiber_to_resume);
2098             // Return here only when this (scheduling) fiber is
2099             // resumed (i.e., this worker wants to reenter the runtime).
2100
2101             // We've already switched from WORKING to IN_RUNTIME in
2102             // the runtime code that handles the fiber switch.  Thus, at
2103             // this point we are IN_RUNTIME already.
2104         }
2105     }
2106
2107     // Finish the scheduling loop.
2108     worker_scheduler_terminate_function(w);
2109 }
2110
2111
2112 /*************************************************************
2113   Forward declarations for reduction protocol.
2114 *************************************************************/
2115
2116 static __cilkrts_worker*
2117 execute_reductions_for_sync(__cilkrts_worker *w,
2118                             full_frame *ff,
2119                             __cilkrts_stack_frame *sf_at_sync);
2120
2121 static __cilkrts_worker*
2122 execute_reductions_for_spawn_return(__cilkrts_worker *w,
2123                                     full_frame *ff,
2124                                     __cilkrts_stack_frame *returning_sf);
2125
2126
2127
2128 /*************************************************************
2129   Scheduler functions that are callable by client code
2130 *************************************************************/
2131 static full_frame *disown(__cilkrts_worker *w,
2132                           full_frame *ff,
2133                           __cilkrts_stack_frame *sf,
2134                           const char *why)
2135 {
2136     CILK_ASSERT(ff);
2137     make_unrunnable(w, ff, sf, sf != 0, why);
2138     w->l->frame_ff = 0;
2139     return ff->parent;
2140 }
2141
2142 /**
2143  * Called when ff is returning from a spawn, and we need to execute a
2144  * reduction.
2145  *
2146  * @param w             The currently executing worker.
2147  * @param ff            The full frame for w.
2148  * @param returning_sf  The stack frame for the spawn helper that is returning.
2149  *
2150  * Normally, by the time we gain control in the runtime, the worker
2151  * has already popped off the __cilkrts_stack_frame "returning_sf"
2152  * from its call chain.
2153  *
2154  * When we have only serial reductions, w->current_stack_frame is not
2155  * needed any more, because w is about to enter the runtime scheduling
2156  * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
2157  * after the runtime finishes the return from spawn and splices ff out
2158  * of the tree of full frames.
2159  *
2160  * To execute a parallel reduction, however, we still want
2161  * w->current_stack_frame == returning_sf, and we are going to use the
2162  * frame ff for a little bit longer.
2163  *
2164  * This method:
2165  *
2166  *   1. Puts returning_sf back as w's current stack frame.
2167  *   2. Makes "ff" runnable again on w.
2168  */
2169 static inline
2170 void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
2171                                               full_frame *ff,
2172                                               __cilkrts_stack_frame *returning_sf) {
2173 #if REDPAR_DEBUG >= 2
2174     CILK_ASSERT(returning_sf);
2175     CILK_ASSERT(returning_sf->worker == w);
2176 #endif
2177     // Change w's current stack frame back to "returning_sf".
2178     //
2179     // Intuitively, w->current_stack_frame should be
2180     // returning_sf->call_parent at this point.
2181     //
2182     // We can not assert this, however, because the pop of
2183     // returning_sf from the call chain has already cleared
2184     // returning_sf->call_parent.  We don't want to restore the call
2185     // parent of returning_sf, because its parent has been stolen, and
2186     // the runtime assumes that steals break this link.
2187
2188     // We cannot assert call_parent is NULL either, since that's not true for
2189     // Win64 exception handling
2190 //    CILK_ASSERT(returning_sf->call_parent == NULL);
2191     w->current_stack_frame = returning_sf;
2192
2193     // Make the full frame "ff" runnable again, in preparation for
2194     // executing the reduction.
2195     make_runnable(w, ff);
2196 }
2197
2198
2199 NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
2200                           __cilkrts_stack_frame *sf_at_sync)
2201 {
2202     full_frame *ff;
2203     STOP_INTERVAL(w, INTERVAL_WORKING);
2204     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2205
2206     // Claim: This read of w->l->frame_ff can occur without
2207     // holding the worker lock because when w has reached a sync
2208     // and entered the runtime (because it stalls), w's deque is empty
2209     // and no one else can steal and change w->l->frame_ff.
2210
2211     ff = w->l->frame_ff;
2212 #ifdef _WIN32
2213     __cilkrts_save_exception_state(w, ff);
2214 #else
2215     // Move any pending exceptions into the full frame
2216     CILK_ASSERT(NULL == ff->pending_exception);
2217     ff->pending_exception = w->l->pending_exception;
2218     w->l->pending_exception = NULL;
2219 #endif
2220
2221     w = execute_reductions_for_sync(w, ff, sf_at_sync);
2222
2223 #if FIBER_DEBUG >= 3
2224     fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n",
2225             cilkos_get_current_thread_id(), w->self, ff);
2226 #endif
2227
2228     longjmp_into_runtime(w, do_sync, sf_at_sync);
2229 }
2230
2231 static void do_sync(__cilkrts_worker *w, full_frame *ff,
2232                     __cilkrts_stack_frame *sf)
2233 {
2234     //int abandoned = 1;
2235     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2236
2237     START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
2238         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2239
2240             CILK_ASSERT(ff);
2241             BEGIN_WITH_FRAME_LOCK(w, ff) {
2242                 CILK_ASSERT(sf->call_parent == 0);
2243                 CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);
2244
2245                 // Before switching into the scheduling fiber, we should have
2246                 // already taken care of deallocating the current
2247                 // fiber.
2248                 CILK_ASSERT(NULL == ff->fiber_self);
2249
2250                 // Update the frame's pedigree information if this is an ABI 1
2251                 // or later frame
2252                 if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
2253                 {
2254                     sf->parent_pedigree.rank = w->pedigree.rank;
2255                     sf->parent_pedigree.parent = w->pedigree.parent;
2256
2257                     // Note that the pedigree rank needs to be updated
2258                     // when setup_for_execution_pedigree runs
2259                     sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
2260                 }
2261
2262                 /* the decjoin() occurs in provably_good_steal() */
2263                 steal_result = provably_good_steal(w, ff);
2264
2265             } END_WITH_FRAME_LOCK(w, ff);
2266             // set w->l->frame_ff = NULL after checking abandoned
2267             if (WAIT_FOR_CONTINUE != steal_result) {
2268                 w->l->frame_ff = NULL;
2269             }
2270         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2271     } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);
2272
2273     // Now, if we are in a replay situation and provably_good_steal() returned
2274     // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
2275     // provably_good_steal(), and release locks until we get a value other
2276     // than WAIT_FOR_CONTINUE from the function.
2277 #ifdef CILK_RECORD_REPLAY
2278     // We don't have to explicitly check for REPLAY_LOG below because
2279     // steal_result can only be set to WAIT_FOR_CONTINUE during replay
2280     while(WAIT_FOR_CONTINUE == steal_result)
2281     {
2282         __cilkrts_sleep();
2283         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2284         {
2285             ff = w->l->frame_ff;
2286             BEGIN_WITH_FRAME_LOCK(w, ff)
2287             {
2288                 steal_result = provably_good_steal(w, ff);
2289             } END_WITH_FRAME_LOCK(w, ff);
2290             if (WAIT_FOR_CONTINUE != steal_result)
2291                 w->l->frame_ff = NULL;
2292         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2293     }
2294 #endif  // CILK_RECORD_REPLAY
2295
2296 #ifdef ENABLE_NOTIFY_ZC_INTRINSIC
2297     // If we can't make any further progress on this thread, tell Inspector
2298     // that we're abandoning the work and will go find something else to do.
2299     if (ABANDON_EXECUTION == steal_result)
2300     {
2301         NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
2302     }
2303 #endif // defined ENABLE_NOTIFY_ZC_INTRINSIC
2304
2305     return; /* back to scheduler loop */
2306 }
2307
2308 /* worker W completely promotes its own deque, simulating the case
2309    where the whole deque is stolen.  We use this mechanism to force
2310    the allocation of new storage for reducers for race-detection
2311    purposes. */
2312 void __cilkrts_promote_own_deque(__cilkrts_worker *w)
2313 {
2314     // Remember the fiber we start this method on.
2315     CILK_ASSERT(w->l->frame_ff);
2316     cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;
2317
2318     BEGIN_WITH_WORKER_LOCK(w) {
2319         while (dekker_protocol(w)) {
2320             /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
2321                and make_child() that this frame should be treated as a spawn
2322                parent, even though we have not assigned it a stack. */
2323             detach_for_steal(w, w, PLACEHOLDER_FIBER);
2324         }
2325     } END_WITH_WORKER_LOCK(w);
2326
2327
2328     // TBD: The management of full frames and fibers is a bit
2329     // sketchy here.  We are promoting stack frames into full frames,
2330     // and pretending they are stolen away, but no other worker is
2331     // actually working on them.  Some runtime invariants
2332     // may be broken here.
2333     //
2334     // Technically, if we are simulating a steal from w
2335     // w should get a new full frame, but
2336     // keep the same fiber.  A real thief would be taking the
2337     // loot frame away, get a new fiber, and starting executing the
2338     // loot frame.
2339     //
2340     // What should a fake thief do?  Where does the frame go?
2341
2342     // In any case, we should be finishing the promotion process with
2343     // the same fiber with.
2344     CILK_ASSERT(w->l->frame_ff);
2345     CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
2346 }
2347
2348
2349
2350 /* the client code calls this function after a spawn when the dekker
2351    protocol fails.  The function may either return or longjmp
2352    into the rts
2353
2354    This function takes in a "returning_sf" argument which corresponds
2355    to the __cilkrts_stack_frame that we are finishing (i.e., the
2356    argument to __cilkrts_leave_frame).
2357    */
2358 void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
2359                                      __cilkrts_stack_frame *returning_sf)
2360 {
2361     full_frame *ff;
2362     int stolen_p;
2363     __cilkrts_stack_frame *saved_sf = NULL;
2364
2365     // For the exception check, stop working and count as time in
2366     // runtime.
2367     STOP_INTERVAL(w, INTERVAL_WORKING);
2368     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2369
2370     START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2371
2372     BEGIN_WITH_WORKER_LOCK(w) {
2373         ff = w->l->frame_ff;
2374         CILK_ASSERT(ff);
2375         /* This code is called only upon a normal return and never
2376            upon an exceptional return.  Assert that this is the
2377            case. */
2378         CILK_ASSERT(!w->l->pending_exception);
2379
2380         reset_THE_exception(w);
2381         stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
2382                                                   speculatively
2383                                                   decremented by the
2384                                                   compiled code */
2385
2386         if (stolen_p) {
2387             /* XXX This will be charged to THE for accounting purposes */
2388             __cilkrts_save_exception_state(w, ff);
2389
2390             // Save the value of the current stack frame.
2391             saved_sf = w->current_stack_frame;
2392
2393             // Reverse the decrement from undo_detach.
2394             // This update effectively resets the deque to be
2395             // empty (i.e., changes w->tail back to equal w->head).
2396             // We need to reset the deque to execute parallel
2397             // reductions.  When we have only serial reductions, it
2398             // does not matter, since serial reductions do not
2399             // change the deque.
2400             w->tail++;
2401 #if REDPAR_DEBUG > 1
2402             // ASSERT our deque is empty.
2403             CILK_ASSERT(w->head == w->tail);
2404 #endif
2405         }
2406     } END_WITH_WORKER_LOCK(w);
2407
2408     STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2409
2410     if (stolen_p)
2411     {
2412         w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2413
2414         // "Mr. Policeman?  My parent always told me that if I was in trouble
2415         // I should ask a nice policeman for help.  I can't find my parent
2416         // anywhere..."
2417         //
2418         // Write a record to the replay log for an attempt to return to a stolen parent
2419         replay_record_orphaned(w);
2420
2421         // Update the pedigree only after we've finished the
2422         // reductions.
2423         update_pedigree_on_leave_frame(w, returning_sf);
2424
2425         // Notify Inspector that the parent has been stolen and we're
2426         // going to abandon this work and go do something else.  This
2427         // will match the cilk_leave_begin in the compiled code
2428         NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);
2429
2430         DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
2431         longjmp_into_runtime(w, do_return_from_spawn, 0);
2432         DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
2433     }
2434     else
2435     {
2436         NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
2437
2438         // If we fail the exception check and return, then switch back
2439         // to working.
2440         STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2441         START_INTERVAL(w, INTERVAL_WORKING);
2442         return;
2443     }
2444 }
2445
2446 /* Return an exception to a stolen parent. */
2447 NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
2448                                         __cilkrts_stack_frame *returning_sf)
2449 {
2450     full_frame *ff = w->l->frame_ff;
2451     STOP_INTERVAL(w, INTERVAL_WORKING);
2452     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2453
2454     // This is almost the same as THE_exception_check, except
2455     // the detach didn't happen, we don't need to undo the tail
2456     // update.
2457     CILK_ASSERT(w->head == w->tail);
2458     w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2459
2460     longjmp_into_runtime(w, do_return_from_spawn, 0);
2461     CILK_ASSERT(0);
2462 }
2463
2464 static void do_return_from_spawn(__cilkrts_worker *w,
2465                                  full_frame *ff,
2466                                  __cilkrts_stack_frame *sf)
2467 {
2468     full_frame *parent_ff;
2469     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2470
2471     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2472         CILK_ASSERT(ff);
2473         CILK_ASSERT(!ff->is_call_child);
2474         CILK_ASSERT(sf == NULL);
2475         parent_ff = ff->parent;
2476
2477         BEGIN_WITH_FRAME_LOCK(w, ff) {
2478             decjoin(ff);
2479         } END_WITH_FRAME_LOCK(w, ff);
2480
2481         BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2482             if (parent_ff->simulated_stolen)
2483                 unconditional_steal(w, parent_ff);
2484             else
2485                 steal_result = provably_good_steal(w, parent_ff);
2486         } END_WITH_FRAME_LOCK(w, parent_ff);
2487
2488     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2489
2490     // Loop here in replay mode
2491 #ifdef CILK_RECORD_REPLAY
2492     // We don't have to explicitly check for REPLAY_LOG below because
2493     // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
2494     // We also don't have to worry about the simulated_stolen flag
2495     // because steal_result can only be set to WAIT_FOR_CONTINUE by
2496     // provably_good_steal().
2497     while(WAIT_FOR_CONTINUE == steal_result)
2498     {
2499         __cilkrts_sleep();
2500         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2501         {
2502             BEGIN_WITH_FRAME_LOCK(w, parent_ff)
2503             {
2504                 steal_result = provably_good_steal(w, parent_ff);
2505             } END_WITH_FRAME_LOCK(w, parent_ff);
2506         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2507     }
2508 #endif  // CILK_RECORD_REPLAY
2509
2510     // Cleanup the child frame.
2511     __cilkrts_destroy_full_frame(w, ff);
2512     return;
2513 }
2514
2515 #ifdef _WIN32
2516 /* migrate an exception across fibers.  Call this function when an exception has
2517  * been thrown and has to traverse across a steal.  The exception has already
2518  * been wrapped up, so all that remains is to longjmp() into the continuation,
2519  * sync, and re-raise it.
2520  */
2521 void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {
2522
2523     __cilkrts_worker *w = sf->worker;
2524     full_frame *ff;
2525
2526     BEGIN_WITH_WORKER_LOCK(w) {
2527         ff = w->l->frame_ff;
2528         reset_THE_exception(w);
2529         /* there is no need to check for a steal because we wouldn't be here if
2530            there weren't a steal. */
2531         __cilkrts_save_exception_state(w, ff);
2532
2533         CILK_ASSERT(w->head == w->tail);
2534     } END_WITH_WORKER_LOCK(w);
2535
2536     {
2537         // TBD(jsukha): This function emulates the
2538         // the "do_return_from_spawn" path.
2539         w = execute_reductions_for_spawn_return(w, ff, sf);
2540     }
2541
2542     longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
2543     CILK_ASSERT(! "Shouldn't be here...");
2544 }
2545 #endif
2546
2547
2548 /* Pop a call stack from TAIL.  Return the call stack, or NULL if the
2549    queue is empty */
2550 __cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
2551 {
2552     __cilkrts_stack_frame *sf;
2553     BEGIN_WITH_WORKER_LOCK(w) {
2554         __cilkrts_stack_frame *volatile *tail = w->tail;
2555         if (w->head < tail) {
2556             --tail;
2557             sf = *tail;
2558             w->tail = tail;
2559         } else {
2560             sf = 0;
2561         }
2562     } END_WITH_WORKER_LOCK(w);
2563     return sf;
2564 }
2565
2566 #ifdef CILK_RECORD_REPLAY
2567 __cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
2568 {
2569     __cilkrts_stack_frame *sf;
2570     BEGIN_WITH_WORKER_LOCK(w) {
2571         if (w->head < w->tail) {
2572             sf = *(w->tail-1);
2573         } else {
2574             sf = 0;
2575         }
2576     } END_WITH_WORKER_LOCK(w);
2577     return sf;
2578 }
2579 #endif
2580
2581
2582 /* Return from a call, not a spawn. */
2583 void __cilkrts_return(__cilkrts_worker *w)
2584 {
2585     full_frame *ff, *parent_ff;
2586
2587     // Count time during the return as in the runtime.
2588     STOP_INTERVAL(w, INTERVAL_WORKING);
2589     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2590     START_INTERVAL(w, INTERVAL_RETURNING);
2591
2592     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2593         ff = w->l->frame_ff;
2594         CILK_ASSERT(ff);
2595         CILK_ASSERT(ff->join_counter == 1);
2596         /* This path is not used to return from spawn. */
2597         CILK_ASSERT(ff->is_call_child);
2598
2599         BEGIN_WITH_FRAME_LOCK(w, ff) {
2600             // After this call, w->l->frame_ff != ff.
2601             // Technically, w will "own" ff until ff is freed,
2602             // however, because ff is a dying leaf full frame.
2603             parent_ff = disown(w, ff, 0, "return");
2604             decjoin(ff);
2605
2606 #ifdef _WIN32
2607             __cilkrts_save_exception_state(w, ff);
2608 #else
2609             // Move the pending exceptions into the full frame
2610             // This should always be NULL if this isn't a
2611             // return with an exception
2612             CILK_ASSERT(NULL == ff->pending_exception);
2613             ff->pending_exception = w->l->pending_exception;
2614             w->l->pending_exception = NULL;
2615 #endif  // _WIN32
2616
2617         } END_WITH_FRAME_LOCK(w, ff);
2618
2619         __cilkrts_fence(); /* redundant */
2620
2621         CILK_ASSERT(parent_ff);
2622
2623         BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2624             finalize_child_for_call(w, parent_ff, ff);
2625         } END_WITH_FRAME_LOCK(w, parent_ff);
2626
2627         ff = pop_next_frame(w);
2628         /* ff will be non-null except when the parent frame is owned
2629            by another worker.
2630            CILK_ASSERT(ff)
2631         */
2632         CILK_ASSERT(!w->l->frame_ff);
2633         if (ff) {
2634             BEGIN_WITH_FRAME_LOCK(w, ff) {
2635                 __cilkrts_stack_frame *sf = ff->call_stack;
2636                 CILK_ASSERT(sf && !sf->call_parent);
2637                 setup_for_execution(w, ff, 1);
2638             } END_WITH_FRAME_LOCK(w, ff);
2639         }
2640     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2641
2642     STOP_INTERVAL(w, INTERVAL_RETURNING);
2643     STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2644     START_INTERVAL(w, INTERVAL_WORKING);
2645 }
2646
2647 static void __cilkrts_unbind_thread()
2648 {
2649     int stop_cilkscreen = 0;
2650     global_state_t *g;
2651
2652     // Take out the global OS mutex to protect accesses to the table of workers
2653     global_os_mutex_lock();
2654
2655     if (cilkg_is_published()) {
2656         __cilkrts_worker *w = __cilkrts_get_tls_worker();
2657         if (w) {
2658             g = w->g;
2659
2660
2661             // Matches the START in bind_thread in cilk-abi.c.
2662             STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2663             STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2664
2665             __cilkrts_set_tls_worker(0);
2666
2667             if (w->self == -1) {
2668                 // This worker is an overflow worker.  I.e., it was created on-
2669                 // demand when the global pool ran out of workers.
2670                 destroy_worker(w);
2671                 __cilkrts_free(w);
2672             } else {
2673                 // This is a normal user worker and needs to be counted by the
2674                 // global state for the purposes of throttling system workers.
2675                 w->l->type = WORKER_FREE;
2676                 __cilkrts_leave_cilk(g);
2677             }
2678
2679             stop_cilkscreen = (0 == g->Q);
2680         }
2681     }
2682     global_os_mutex_unlock();
2683
2684     /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
2685      * os mutex. */
2686     if (stop_cilkscreen)
2687         __cilkrts_cilkscreen_disable_instrumentation();
2688 }
2689
2690 /* special return from the initial frame */
2691
2692 void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
2693 {
2694     struct cilkred_map *rm;
2695
2696     // When we are returning from the initial frame, switch from
2697     // INTERVAL_WORKING into INTERVAL_IN_RUNTIME.
2698     STOP_INTERVAL(w, INTERVAL_WORKING);
2699     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2700
2701     /* This is only called on a user thread worker. */
2702     CILK_ASSERT(w->l->type == WORKER_USER);
2703
2704     #if REDPAR_DEBUG >= 3
2705     fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
2706             w->self, w->l->frame_ff);
2707     #endif
2708
2709     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2710         full_frame *ff = w->l->frame_ff;
2711         CILK_ASSERT(ff);
2712         CILK_ASSERT(ff->join_counter == 1);
2713         w->l->frame_ff = 0;
2714
2715         CILK_ASSERT(ff->fiber_self);
2716         // Save any TBB interop data for the next time this thread enters Cilk
2717         cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);
2718
2719         // Deallocate cilk_fiber that mapped to the user stack.  The stack
2720         // itself does not get deallocated (of course) but our data
2721         // structure becomes divorced from it.
2722
2723 #if FIBER_DEBUG >= 1
2724         fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self  = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
2725                 cilkos_get_current_thread_id(),
2726                 w->self,
2727                 ff->fiber_self,
2728                 w->l->scheduling_fiber,
2729                 w->l->type);
2730 #endif
2731         // The fiber in ff is a user-code fiber.  The fiber in
2732         // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
2733         // never be equal.  When a user worker returns (and will unbind), we
2734         // should destroy only the fiber in ff.  The scheduling fiber will be
2735         // re-used.
2736
2737         CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);
2738
2739         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
2740             // This fiber might not be deallocated here if there
2741             // is a pending exception on Windows that refers
2742             // to this fiber.
2743             //
2744             // First "suspend" the fiber, and then try to delete it.
2745             cilk_fiber_deallocate_from_thread(ff->fiber_self);
2746         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
2747         ff->fiber_self = NULL;
2748
2749         /* Save reducer map into global_state object */
2750         rm = w->reducer_map;
2751         w->reducer_map = NULL;
2752
2753 #if REDPAR_DEBUG >= 3
2754         fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
2755                 w->self,
2756                 rm,
2757                 ff);
2758 #endif
2759         __cilkrts_destroy_full_frame(w, ff);
2760
2761
2762         /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
2763     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2764
2765
2766     save_pedigree_leaf_from_user_worker(w);
2767
2768     // Workers can have NULL reducer maps now.
2769     if (rm) {
2770         __cilkrts_destroy_reducer_map(w, rm);
2771     }
2772
2773
2774 #if FIBER_DEBUG >= 1
2775     __cilkrts_worker* tmp = w;
2776     int tmp_id = w->self;
2777     fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n",
2778             w->self,
2779             w);
2780 #endif
2781
2782     w = NULL;
2783
2784     __cilkrts_unbind_thread();
2785
2786 #if FIBER_DEBUG >= 1
2787
2788     fprintf(stderr, "w=%p, %d: Finished unbind\n",
2789             tmp, tmp_id);
2790 #endif
2791
2792     /* Other workers will stop trying to steal if this was the last worker. */
2793
2794     return;
2795 }
2796
2797
2798 /*
2799  * __cilkrts_restore_stealing
2800  *
2801  * Restore the protected_tail to a previous state, possibly allowing frames
2802  * to be stolen.  The dekker_protocol has been extended to steal only if
2803  * head+1 is < protected_tail.
2804  */
2805
2806 void __cilkrts_restore_stealing(
2807     __cilkrts_worker *w,
2808     __cilkrts_stack_frame *volatile *saved_protected_tail)
2809 {
2810     /* On most x86 this pair of operations would be slightly faster
2811        as an atomic exchange due to the implicit memory barrier in
2812        an atomic instruction. */
2813     w->protected_tail = saved_protected_tail;
2814     __cilkrts_fence();
2815 }
2816
2817 /*
2818  * __cilkrts_disallow_stealing
2819  *
2820  * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
2821  * frames from being stolen.  If NEW_PROTECTED_TAIL is NULL, prevent
2822  * stealing from the whole queue.  The dekker_protocol has been
2823  * extended to only steal if head+1 is also < protected_tail.
2824  */
2825
2826 __cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
2827     __cilkrts_worker *w,
2828     __cilkrts_stack_frame *volatile *new_protected_tail)
2829 {
2830     __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;
2831
2832     if (!new_protected_tail)
2833         new_protected_tail = w->l->ltq;
2834
2835     if (w->protected_tail > new_protected_tail) {
2836         w->protected_tail = new_protected_tail;
2837         /* Issue a store-store barrier.  The update to protected_tail
2838            here must precede the update to tail in the next spawn.
2839            On x86 this is probably not needed. */
2840 #if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__)
2841         _mm_sfence();
2842 #else
2843         __cilkrts_fence();
2844 #endif
2845     }
2846
2847     return saved_protected_tail;
2848 }
2849
2850 /*************************************************************
2851   Initialization and startup
2852 *************************************************************/
2853
2854 __cilkrts_worker *make_worker(global_state_t *g,
2855                               int self, __cilkrts_worker *w)
2856 {
2857     w->self = self;
2858     w->g = g;
2859
2860     w->pedigree.rank = 0;    // Initial rank is 0
2861     w->pedigree.parent = NULL;
2862
2863     w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));
2864
2865     __cilkrts_frame_malloc_per_worker_init(w);
2866
2867     w->reducer_map = NULL;
2868     w->current_stack_frame = NULL;
2869     w->reserved = NULL;
2870
2871     w->l->worker_magic_0 = WORKER_MAGIC_0;
2872     w->l->team = NULL;
2873     w->l->type = WORKER_FREE;
2874
2875     __cilkrts_mutex_init(&w->l->lock);
2876     __cilkrts_mutex_init(&w->l->steal_lock);
2877     w->l->do_not_steal = 0;
2878     w->l->frame_ff = 0;
2879     w->l->next_frame_ff = 0;
2880     w->l->last_full_frame = NULL;
2881
2882     w->l->ltq = (__cilkrts_stack_frame **)
2883         __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
2884     w->ltq_limit = w->l->ltq + g->ltqsize;
2885     w->head = w->tail = w->l->ltq;
2886
2887     cilk_fiber_pool_init(&w->l->fiber_pool,
2888                          &g->fiber_pool,
2889                          g->stack_size,
2890                          g->fiber_pool_size,
2891                          0,   // alloc_max is 0.  We don't allocate from the heap directly without checking the parent pool.
2892                          0);
2893 #if FIBER_DEBUG >= 2
2894     fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
2895             cilkos_get_current_thread_id(),
2896             w->self, w,
2897             &w->l->fiber_pool);
2898 #endif
2899     w->l->scheduling_fiber = NULL;
2900     w->l->original_pedigree_leaf = NULL;
2901     w->l->rand_seed = 0; /* the scheduler will overwrite this field */
2902
2903     w->l->post_suspend = 0;
2904     w->l->suspended_stack = 0;
2905     w->l->fiber_to_free = NULL;
2906     w->l->pending_exception = NULL;
2907
2908 #if CILK_PROFILE
2909     w->l->stats = __cilkrts_malloc(sizeof(statistics));
2910     __cilkrts_init_stats(w->l->stats);
2911 #else
2912     w->l->stats = NULL;
2913 #endif
2914     w->l->steal_failure_count = 0;
2915
2916     w->l->work_stolen = 0;
2917
2918     // Initialize record/replay assuming we're doing neither
2919     w->l->record_replay_fptr = NULL;
2920     w->l->replay_list_root = NULL;
2921     w->l->replay_list_entry = NULL;
2922     w->l->signal_node = NULL;
2923     // Nothing's been stolen yet
2924     w->l->worker_magic_1 = WORKER_MAGIC_1;
2925
2926     /*w->parallelism_disabled = 0;*/
2927
2928     // Allow stealing all frames. Sets w->saved_protected_tail
2929     __cilkrts_restore_stealing(w, w->ltq_limit);
2930
2931     __cilkrts_init_worker_sysdep(w);
2932
2933     reset_THE_exception(w);
2934
2935     return w;
2936 }
2937
2938 void destroy_worker(__cilkrts_worker *w)
2939 {
2940     CILK_ASSERT (NULL == w->l->pending_exception);
2941
2942     // Deallocate the scheduling fiber
2943     if (NULL != w->l->scheduling_fiber)
2944     {
2945         // The scheduling fiber is the main fiber for system workers and must
2946         // be deallocated by the thread that created it.  Thus, we can
2947         // deallocate only free workers' (formerly user workers) scheduling
2948         // fibers here.
2949         CILK_ASSERT(WORKER_FREE == w->l->type);
2950
2951 #if FIBER_DEBUG >=1
2952         fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n",
2953                 cilkos_get_current_thread_id(),
2954                 w,
2955                 w->self,
2956                 w->l->scheduling_fiber);
2957 #endif
2958         int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
2959         // Scheduling fiber should never have extra references because of exceptions.
2960         CILK_ASSERT(0 == ref_count);
2961         w->l->scheduling_fiber = NULL;
2962     }
2963
2964 #if CILK_PROFILE
2965     if (w->l->stats) {
2966         __cilkrts_free(w->l->stats);
2967     }
2968 #else
2969     CILK_ASSERT(NULL == w->l->stats);
2970 #endif
2971
2972     /* Free any cached fibers. */
2973     cilk_fiber_pool_destroy(&w->l->fiber_pool);
2974
2975     __cilkrts_destroy_worker_sysdep(w);
2976
2977     if (w->l->signal_node) {
2978         CILK_ASSERT(WORKER_SYSTEM == w->l->type);
2979         signal_node_destroy(w->l->signal_node);
2980     }
2981
2982     __cilkrts_free(w->l->ltq);
2983     __cilkrts_mutex_destroy(0, &w->l->lock);
2984     __cilkrts_mutex_destroy(0, &w->l->steal_lock);
2985     __cilkrts_frame_malloc_per_worker_cleanup(w);
2986
2987     __cilkrts_free(w->l);
2988
2989     // The caller is responsible for freeing the worker memory
2990 }
2991
2992 /*
2993  * Make a worker into a system worker.
2994  */
2995 static void make_worker_system(__cilkrts_worker *w) {
2996     CILK_ASSERT(WORKER_FREE == w->l->type);
2997     w->l->type = WORKER_SYSTEM;
2998     w->l->signal_node = signal_node_create();
2999 }
3000
3001 void __cilkrts_deinit_internal(global_state_t *g)
3002 {
3003     int i;
3004     __cilkrts_worker *w;
3005
3006     // If there's no global state then we're done
3007     if (NULL == g)
3008         return;
3009
3010 #ifdef CILK_PROFILE
3011     __cilkrts_dump_stats_to_stderr(g);
3012 #endif
3013
3014     w = g->workers[0];
3015     if (w->l->frame_ff) {
3016         __cilkrts_destroy_full_frame(w, w->l->frame_ff);
3017         w->l->frame_ff = 0;
3018     }
3019
3020     // Release any resources used for record/replay
3021     replay_term(g);
3022
3023     // Destroy any system dependent global state
3024     __cilkrts_destroy_global_sysdep(g);
3025
3026     for (i = 0; i < g->total_workers; ++i)
3027         destroy_worker(g->workers[i]);
3028
3029     // Free memory for all worker blocks which were allocated contiguously
3030     __cilkrts_free(g->workers[0]);
3031
3032     __cilkrts_free(g->workers);
3033
3034     cilk_fiber_pool_destroy(&g->fiber_pool);
3035     __cilkrts_frame_malloc_global_cleanup(g);
3036
3037     cilkg_deinit_global_state();
3038 }
3039
3040 /*
3041  * Wake the runtime by notifying the system workers that they can steal.  The
3042  * first user worker into the runtime should call this.
3043  */
3044 static void wake_runtime(global_state_t *g)
3045 {
3046     __cilkrts_worker *root;
3047     if (g->P > 1) {
3048         // Send a message to the root node.  The message will propagate.
3049         root = g->workers[0];
3050         CILK_ASSERT(root->l->signal_node);
3051         signal_node_msg(root->l->signal_node, 1);
3052     }
3053 }
3054
3055 /*
3056  * Put the runtime to sleep.  The last user worker out of the runtime should
3057  * call this.  Like Dad always said, turn out the lights when nobody's in the
3058  * room.
3059  */
3060 static void sleep_runtime(global_state_t *g)
3061 {
3062     __cilkrts_worker *root;
3063     if (g->P > 1) {
3064         // Send a message to the root node.  The message will propagate.
3065         root = g->workers[0];
3066         CILK_ASSERT(root->l->signal_node);
3067         signal_node_msg(root->l->signal_node, 0);
3068     }
3069 }
3070
3071 /* Called when a user thread joins Cilk.
3072    Global lock must be held. */
3073 void __cilkrts_enter_cilk(global_state_t *g)
3074 {
3075     if (g->Q++ == 0) {
3076         // If this is the first user thread to enter Cilk wake
3077         // up all the workers.
3078         wake_runtime(g);
3079     }
3080 }
3081
3082 /* Called when a user thread leaves Cilk.
3083    Global lock must be held. */
3084 void __cilkrts_leave_cilk(global_state_t *g)
3085 {
3086     if (--g->Q == 0) {
3087         // Put the runtime to sleep.
3088         sleep_runtime(g);
3089     }
3090 }
3091
3092 /*
3093  * worker_runnable
3094  *
3095  * Return true if the worker should continue to try to steal.  False, otherwise.
3096  */
3097
3098 NOINLINE
3099 static enum schedule_t worker_runnable(__cilkrts_worker *w)
3100 {
3101     global_state_t *g = w->g;
3102
3103     /* If this worker has something to do, do it.
3104        Otherwise the work would be lost. */
3105     if (w->l->next_frame_ff)
3106         return SCHEDULE_RUN;
3107
3108     // If Cilk has explicitly (by the user) been told to exit (i.e., by
3109     // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return 0.
3110     if (g->work_done)
3111         return SCHEDULE_EXIT;
3112
3113     if (0 == w->self) {
3114         // This worker is the root node and is the only one that may query the
3115         // global state to see if there are still any user workers in Cilk.
3116         if (w->l->steal_failure_count > g->max_steal_failures) {
3117             if (signal_node_should_wait(w->l->signal_node)) {
3118                 return SCHEDULE_WAIT;
3119             } else {
3120                 // Reset the steal_failure_count since we have verified that
3121                 // user workers are still in Cilk.
3122                 w->l->steal_failure_count = 0;
3123             }
3124         }
3125     } else if (WORKER_SYSTEM == w->l->type &&
3126                signal_node_should_wait(w->l->signal_node)) {
3127         // This worker has been notified by its parent that it should stop
3128         // trying to steal.
3129         return SCHEDULE_WAIT;
3130     }
3131
3132     return SCHEDULE_RUN;
3133 }
3134
3135
3136
3137 // Initialize the worker structs, but don't start the workers themselves.
3138 static void init_workers(global_state_t *g)
3139 {
3140     int total_workers = g->total_workers;
3141     int i;
3142     struct CILK_ALIGNAS(256) buffered_worker {
3143         __cilkrts_worker w;
3144         char buf[64];
3145     } *workers_memory;
3146
3147     /* not needed if only one worker */
3148     cilk_fiber_pool_init(&g->fiber_pool,
3149                          NULL,
3150                          g->stack_size,
3151                          g->global_fiber_pool_size,           // buffer_size
3152                          g->max_stacks,                       // maximum # to allocate
3153                          1);
3154
3155     cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
3156                                     (g->max_stacks ? g->max_stacks : INT_MAX));
3157
3158     g->workers = (__cilkrts_worker **)
3159         __cilkrts_malloc(total_workers * sizeof(*g->workers));
3160
3161     // Allocate 1 block of memory for workers to make life easier for tools
3162     // like Inspector which run multithreaded and need to know the memory
3163     // range for all the workers that will be accessed in a user's program
3164     workers_memory = (struct buffered_worker*)
3165         __cilkrts_malloc(sizeof(*workers_memory) * total_workers);
3166
3167     // Notify any tools that care (Cilkscreen and Inspector) that they should
3168     // ignore memory allocated for the workers
3169     __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
3170                                       &workers_memory[total_workers]);
3171
3172     // Initialize worker structs, including unused worker slots.
3173     for (i = 0; i < total_workers; ++i) {
3174         g->workers[i] = make_worker(g, i, &workers_memory[i].w);
3175     }
3176
3177     // Set the workers in the first P - 1 slots to be system workers.
3178     // Remaining worker structs already have type == 0.
3179     for (i = 0; i < g->system_workers; ++i) {
3180         make_worker_system(g->workers[i]);
3181     }
3182 }
3183
3184 void __cilkrts_init_internal(int start)
3185 {
3186     global_state_t *g = NULL;
3187
3188     if (cilkg_is_published()) {
3189         g = cilkg_init_global_state();
3190     }
3191     else {
3192
3193         // We think the state has not been published yet.
3194         // Grab the lock and try to initialize/publish.
3195         global_os_mutex_lock();
3196
3197         if (cilkg_is_published()) {
3198             // Some other thread must have snuck in and published.
3199             g = cilkg_init_global_state();
3200         }
3201         else {
3202             // Initialize and retrieve global state
3203             g = cilkg_init_global_state();
3204
3205             // Set the scheduler pointer
3206             g->scheduler = worker_scheduler_function;
3207
3208             // If we're running under a sequential P-Tool (Cilkscreen or
3209             // Cilkview) then there's only one worker and we need to tell
3210             // the tool about the extent of the stack
3211             if (g->under_ptool)
3212                 __cilkrts_establish_c_stack();
3213             init_workers(g);
3214
3215             // Initialize per-work record/replay logging
3216             replay_init_workers(g);
3217
3218             // Initialize any system dependent global state
3219             __cilkrts_init_global_sysdep(g);
3220
3221
3222             cilkg_publish_global_state(g);
3223         }
3224
3225         global_os_mutex_unlock();
3226     }
3227
3228     CILK_ASSERT(g);
3229
3230     if (start && !g->workers_running)
3231     {
3232         // Acquire the global OS mutex while we're starting the workers
3233         global_os_mutex_lock();
3234         if (!g->workers_running)
3235             // Start P - 1 system workers since P includes the first user
3236             // worker.
3237             __cilkrts_start_workers(g, g->P - 1);
3238         global_os_mutex_unlock();
3239     }
3240 }
3241
3242
3243 /************************************************************************
3244   Methods for reducer protocol.
3245
3246   Reductions occur in two places:
3247     A. A full frame "ff" is returning from a spawn with a stolen parent.
3248     B. A full frame "ff" is stalling at a sync.
3249
3250   To support parallel reductions, reduction functions need to be
3251   executed while control is on a user stack, before jumping into the
3252   runtime.  These reductions can not occur while holding a worker or
3253   frame lock.
3254
3255   Before a worker w executes a reduction in either Case A or B, w's
3256   deque is empty.
3257
3258   Since parallel reductions push work onto the deque, we must do extra
3259   work to set up runtime data structures properly before reductions
3260   begin to allow stealing.  ( Normally, when we have only serial
3261   reductions, once a worker w starts a reduction, its deque remains
3262   empty until w either steals another frame or resumes a suspended
3263   frame.  Thus, we don't care about the state of the deque, since w
3264   will reset its deque when setting up execution of a frame. )
3265
3266   To allow for parallel reductions, we coerce the runtime data
3267   structures so that, from their perspective, it looks as though we
3268   have spliced in an "execute_reductions()" function.  Consider the
3269   two cases for reductions:
3270
3271     Case A: Return from a spawn with a stolen parent.
3272       Consider a spawned function g is returning on a worker w.
3273       Assume:
3274           -   g was spawned from a parent function f.
3275           -   ff is the full frame for g's spawn helper
3276           -   sf be the __cilkrts_stack_frame for g's spawn helper.
3277
3278       We are conceptually splicing "execute_reductions()" so that it
3279       occurs immediately before the spawn helper of g returns to f.
3280
3281       We do so by creating two different world views --- one for the
3282       runtime data structures, and one for the actual control flow.
3283
3284         - Before reductions begin, the runtime data structures should
3285           look as though the spawn helper of g is calling
3286           "execute_reductions()", in terms of both the user stack and
3287           worker deque.  More precisely, w should satisfy the
3288           following properties:
3289
3290               (a) w has ff as its full frame,
3291               (b) w has sf as its __cilkrts_stack_frame, and
3292               (c) w has an empty deque.
3293
3294           If the runtime satisfies these properties, then if w
3295           encounters a spawn in a parallel reduction, it can push onto
3296           a valid deque.  Also, when a steal from w occurs, it will
3297           build the correct tree of full frames when w is stolen from.
3298
3299         - In actual control flow, however, once the
3300           "execute_reductions()" function returns, it is actually
3301           returning to runtime code instead of g's spawn helper.
3302
3303           At the point a worker w began executing reductions, the
3304           control flow / compiled code had already finished g's spawn
3305           helper, and w was about to enter the runtime.  With parallel
3306           reductions, some worker v (which might be different from w)
3307           is the one returning to the runtime.
3308
3309
3310       The reduction logic consists of 4 steps:
3311
3312        A1. Restore runtime data structures to make it look as though
3313            the spawn helper of g() is still the currently executing
3314            frame for w.
3315
3316        A2. Execute reductions on the user stack.  Reductions also
3317            includes the logic for exceptions and stacks.  Note that
3318            reductions start on w, but may finish on a different
3319            worker if there is parallelism in the reduce.
3320
3321        A3. Splice out ff from the tree of full frames.
3322
3323        A4. Jump into the runtime/scheduling stack and execute
3324            "do_return_from_spawn".  This method
3325
3326            (a) Frees the user stack we were just on if it is no longer needed.
3327            (b) Decrement the join counter on ff->parent, and tries to do a
3328                provably good steal.
3329            (c) Clean up the full frame ff.
3330
3331
3332    Case B: Stalling at a sync.
3333
3334      Consider a function g(), with full frame ff and
3335      __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
3336      are executing reductions.
3337
3338      Conceptually, we are splicing in an "execute_reductions()"
3339      function into g() as the last action that g() takes immediately
3340      before it executes the cilk_sync.
3341
3342      The reduction logic for this case is similar to Case A.
3343
3344        B1. Restore the runtime data structures.
3345
3346            The main difference from Case A is that ff/sf is still a
3347            frame that needs to be executed later (since it is stalling
3348            at a cilk_sync).  Thus, we also need to save the current
3349            stack information into "ff" so that we can correctly resume
3350            execution of "ff" after the sync.
3351
3352        B2. Execute reductions on the user stack.
3353
3354        B3. No frame to splice out of the tree.
3355
3356        B4. Jump into the runtime/scheduling stack and execute "do_sync".
3357            This method:
3358            (a) Frees the user stack we were just on if it is no longer needed.
3359            (b) Tries to execute a provably good steal.
3360
3361   Finally, for the reducer protocol, we consider two reduction paths,
3362   namely a "fast" and "slow" path.  On a fast path, only trivial
3363   merges of reducer maps happen (i.e., one or both of the maps are
3364   NULL).  Otherwise, on the slow path, a reduction actually needs to
3365   happen.
3366
3367 *****************************************************************/
3368
3369 /**
3370  * @brief Locations to store the result of a reduction.
3371  *
3372  * Struct storing pointers to the fields in our "left" sibling that we
3373  * should update when splicing out a full frame or stalling at a sync.
3374  */
3375 typedef struct {
3376     /** A pointer to the location of our left reducer map. */
3377     struct cilkred_map **map_ptr;
3378
3379     /** A pointer to the location of our left exception. */
3380     struct pending_exception_info **exception_ptr;
3381 } splice_left_ptrs;
3382
3383 /**
3384  * For a full frame returning from a spawn, calculate the pointers to
3385  * the maps and exceptions to my left.
3386  *
3387  * @param w   The currently executing worker.
3388  * @param ff  Full frame that is dying
3389  * @return    Pointers to our "left" for reducers and exceptions.
3390  */
3391 static inline
3392 splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
3393                                                     full_frame *ff)
3394 {
3395     // ASSERT: we hold the lock on ff->parent
3396
3397     splice_left_ptrs left_ptrs;
3398     if (ff->left_sibling) {
3399         left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
3400         left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
3401     }
3402     else {
3403         full_frame *parent_ff = ff->parent;
3404         left_ptrs.map_ptr = &parent_ff->children_reducer_map;
3405         left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
3406     }
3407     return left_ptrs;
3408 }
3409
3410 /**
3411  * For a full frame at a sync, calculate the pointers to the maps and
3412  * exceptions to my left.
3413  *
3414  * @param w   The currently executing worker.
3415  * @param ff  Full frame that is stalling at a sync.
3416  * @return    Pointers to our "left" for reducers and exceptions.
3417  */
3418 static inline
3419 splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
3420                                             full_frame *ff)
3421 {
3422     // ASSERT: we hold the lock on ff
3423     splice_left_ptrs left_ptrs;
3424
3425     // Figure out which map to the left we should merge into.
3426     if (ff->rightmost_child) {
3427         CILK_ASSERT(ff->rightmost_child->parent == ff);
3428         left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
3429         left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
3430     }
3431     else {
3432         // We have no children.  Then, we should be the last
3433         // worker at the sync... "left" is our child map.
3434         left_ptrs.map_ptr = &(ff->children_reducer_map);
3435         left_ptrs.exception_ptr = &(ff->child_pending_exception);
3436     }
3437     return left_ptrs;
3438 }
3439
3440 /**
3441  * After we have completed all reductions on a spawn return, call this
3442  * method to finish up before jumping into the runtime.
3443  *
3444  *   1. Perform the "reduction" on stacks, i.e., execute the left
3445  *      holder logic to pass the leftmost stack up.
3446  *
3447  *      w->l->fiber_to_free holds any stack that needs to be freed
3448  *      when control switches into the runtime fiber.
3449  *
3450  *   2. Unlink and remove child_ff from the tree of full frames.
3451  *
3452  * @param   w          The currently executing worker.
3453  * @param   parent_ff  The parent of child_ff.
3454  * @param   child_ff   The full frame returning from a spawn.
3455  */
3456 static inline
3457 void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
3458                                        full_frame *parent_ff,
3459                                        full_frame *child_ff)
3460 {
3461     CILK_ASSERT(w->l->fiber_to_free == NULL);
3462
3463     // Execute left-holder logic for stacks.
3464     if (child_ff->left_sibling || parent_ff->fiber_child) {
3465         // Case where we are not the leftmost stack.
3466         CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);
3467
3468         // Remember any fiber we need to free in the worker.
3469         // After we jump into the runtime, we will actually do the
3470         // free.
3471         w->l->fiber_to_free = child_ff->fiber_self;
3472     }
3473     else {
3474         // We are leftmost, pass stack/fiber up to parent.
3475         // Thus, no stack/fiber to free.
3476         parent_ff->fiber_child = child_ff->fiber_self;
3477         w->l->fiber_to_free = NULL;
3478     }
3479
3480     child_ff->fiber_self = NULL;
3481
3482     unlink_child(parent_ff, child_ff);
3483 }
3484
3485
3486 /**
3487  * Executes any fast reductions necessary to splice ff out of the tree
3488  * of full frames.
3489  *
3490  * This "fast" path performs only trivial merges of reducer maps,
3491  * i.e,. when one of them is NULL.
3492  * (See slow_path_reductions_for_spawn_return() for slow path.)
3493  *
3494  * Returns: 1 if we finished all reductions.
3495  * Returns: 0 if there are still reductions to execute, and
3496  *            we should execute the slow path.
3497  *
3498  * This method assumes w holds the frame lock on parent_ff.
3499  * After this method completes:
3500  *    1. We have spliced ff out of the tree of full frames.
3501  *    2. The reducer maps of child_ff have been deposited
3502  *       "left" according to the reducer protocol.
3503  *    3. w->l->stack_to_free stores the stack
3504  *       that needs to be freed once we jump into the runtime.
3505  *
3506  * We have not, however, decremented the join counter on ff->parent.
3507  * This prevents any other workers from resuming execution of the parent.
3508  *
3509  * @param   w    The currently executing worker.
3510  * @param   ff   The full frame returning from a spawn.
3511  * @return  NULL if we finished all reductions.
3512  * @return  The address where the left map is stored (which should be passed to
3513  *          slow_path_reductions_for_spawn_return()) if there are
3514  *          still reductions to execute.
3515  */
3516 struct cilkred_map**
3517 fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
3518                                       full_frame *ff)
3519 {
3520     // ASSERT: we hold ff->parent->lock.
3521     splice_left_ptrs left_ptrs;
3522
3523     CILK_ASSERT(NULL == w->l->pending_exception);
3524
3525     // Figure out the pointers to the left where I want
3526     // to put reducers and exceptions.
3527     left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);
3528
3529     // Go ahead and merge exceptions while holding the lock.
3530     splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
3531
3532     // Now check if we have any reductions to perform.
3533     //
3534     // Consider all the cases of left, middle and right maps.
3535     //  0. (-, -, -)  :  finish and return 1
3536     //  1. (L, -, -)  :  finish and return 1
3537     //  2. (-, M, -)  :  slide over to left, finish, and return 1.
3538     //  3. (L, M, -)  :  return 0
3539     //  4. (-, -, R)  :  slide over to left, finish, and return 1.
3540     //  5. (L, -, R)  :  return 0
3541     //  6. (-, M, R)  :  return 0
3542     //  7. (L, M, R)  :  return 0
3543     //
3544     // In terms of code:
3545     //  L == *left_ptrs.map_ptr
3546     //  M == w->reducer_map
3547     //  R == f->right_reducer_map.
3548     //
3549     // The goal of the code below is to execute the fast path with
3550     // as few branches and writes as possible.
3551
3552     int case_value = (*(left_ptrs.map_ptr) != NULL);
3553     case_value += ((w->reducer_map != NULL) << 1);
3554     case_value += ((ff->right_reducer_map != NULL) << 2);
3555
3556     // Fastest path is case_value == 0 or 1.
3557     if (case_value >=2) {
3558         switch (case_value) {
3559         case 2:
3560             *(left_ptrs.map_ptr) = w->reducer_map;
3561             w->reducer_map = NULL;
3562             return NULL;
3563             break;
3564         case 4:
3565             *(left_ptrs.map_ptr) = ff->right_reducer_map;
3566             ff->right_reducer_map = NULL;
3567             return NULL;
3568         default:
3569             // If we have to execute the slow path, then
3570             // return the pointer to the place to deposit the left
3571             // map.
3572             return left_ptrs.map_ptr;
3573         }
3574     }
3575
3576     // Do nothing
3577     return NULL;
3578 }
3579
3580
3581 /**
3582  * Executes any reductions necessary to splice "ff" frame out of
3583  * the steal tree.
3584  *
3585  * This method executes the "slow" path for reductions on a spawn
3586  * return, i.e., there are non-NULL maps that need to be merged
3587  * together.
3588  *
3589  * This method should execute only if
3590  * fast_path_reductions_for_spawn_return() returns a non-NULL
3591  * left_map_ptr.
3592  *
3593  * Upon entry, left_map_ptr should be the location of the left map
3594  * at the start of the reduction, as calculated by
3595  * fast_path_reductions_for_spawn_return().
3596  *
3597  * After this method completes:
3598  *    1. We have spliced ff out of the tree of full frames.
3599  *    2. The reducer maps of child_ff have been deposited
3600  *       "left" according to the reducer protocol.
3601  *    3. w->l->stack_to_free stores the stack
3602  *       that needs to be freed once we jump into the runtime.
3603  * We have not, however, decremented the join counter on ff->parent,
3604  * so no one can resume execution of the parent yet.
3605  *
3606  * WARNING:
3607  *   This method assumes the lock on ff->parent is held upon entry, and
3608  *   Upon exit, the worker that returns still holds a lock on ff->parent
3609  *   This method can, however, release and reacquire the lock on ff->parent.
3610  *
3611  * @param w             The currently executing worker.
3612  * @param ff            The full frame returning from a spawn.
3613  * @param left_map_ptr  Pointer to our initial left map.
3614  * @return              The worker that this method returns on.
3615  */
3616 static __cilkrts_worker*
3617 slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
3618                                       full_frame *ff,
3619                                       struct cilkred_map **left_map_ptr)
3620 {
3621
3622     // CILK_ASSERT: w is holding frame lock on parent_ff.
3623 #if REDPAR_DEBUG > 0
3624     CILK_ASSERT(!ff->rightmost_child);
3625     CILK_ASSERT(!ff->is_call_child);
3626 #endif
3627
3628     // Loop invariant:
3629     // When beginning this loop, we should
3630     //   1. Be holding the lock on ff->parent.
3631     //   2. left_map_ptr should be the address of the pointer to the left map.
3632     //   3. All maps should be slid over left by one, if possible.
3633     //   4. All exceptions should be merged so far.
3634     while (1) {
3635
3636         // Slide middle map left if possible.
3637         if (!(*left_map_ptr)) {
3638             *left_map_ptr = w->reducer_map;
3639             w->reducer_map = NULL;
3640         }
3641         // Slide right map to middle if possible.
3642         if (!w->reducer_map) {
3643             w->reducer_map = ff->right_reducer_map;
3644             ff->right_reducer_map = NULL;
3645         }
3646
3647         // Since we slid everything left by one,
3648         // we are finished if there is no middle map.
3649         if (!w->reducer_map) {
3650             verify_current_wkr(w);
3651             return w;
3652         }
3653         else {
3654             struct cilkred_map* left_map;
3655             struct cilkred_map* middle_map;
3656             struct cilkred_map* right_map;
3657
3658             // Take all the maps from their respective locations.
3659             // We can't leave them in place and execute a reduction because these fields
3660             // might change once we release the lock.
3661             left_map = *left_map_ptr;
3662             *left_map_ptr = NULL;
3663             middle_map = w->reducer_map;
3664             w->reducer_map = NULL;
3665             right_map = ff->right_reducer_map;
3666             ff->right_reducer_map = NULL;
3667
3668             // WARNING!!! Lock release here.
3669             // We have reductions to execute (and we can't hold locks).
3670             __cilkrts_frame_unlock(w, ff->parent);
3671
3672             // After we've released the lock, start counting time as
3673             // WORKING again.
3674             STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
3675             START_INTERVAL(w, INTERVAL_WORKING);
3676
3677             // Merge all reducers into the left map.
3678             left_map = repeated_merge_reducer_maps(&w,
3679                                                    left_map,
3680                                                    middle_map);
3681             verify_current_wkr(w);
3682             left_map = repeated_merge_reducer_maps(&w,
3683                                                    left_map,
3684                                                    right_map);
3685             verify_current_wkr(w);
3686             CILK_ASSERT(NULL == w->reducer_map);
3687             // Put the final answer back into w->reducer_map.
3688             w->reducer_map = left_map;
3689
3690             // Save any exceptions generated because of the reduction
3691             // process from the returning worker.  These get merged
3692             // the next time around the loop.
3693             CILK_ASSERT(NULL == ff->pending_exception);
3694             ff->pending_exception = w->l->pending_exception;
3695             w->l->pending_exception = NULL;
3696
3697             STOP_INTERVAL(w, INTERVAL_WORKING);
3698             START_INTERVAL(w, INTERVAL_IN_RUNTIME);
3699
3700             // Lock ff->parent for the next loop around.
3701             __cilkrts_frame_lock(w, ff->parent);
3702
3703             // Once we have the lock again, recompute who is to our
3704             // left.
3705             splice_left_ptrs left_ptrs;
3706             left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);
3707
3708             // Update the pointer for the left map.
3709             left_map_ptr = left_ptrs.map_ptr;
3710             // Splice the exceptions for spawn.
3711             splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
3712         }
3713     }
3714     // We should never break out of this loop.
3715
3716     CILK_ASSERT(0);
3717     return NULL;
3718 }
3719
3720
3721
3722 /**
3723  * Execute reductions when returning from a spawn whose parent has
3724  * been stolen.
3725  *
3726  * Execution may start on w, but may finish on a different worker.
3727  * This method acquires/releases the lock on ff->parent.
3728  *
3729  * @param w            The currently executing worker.
3730  * @param ff           The full frame of the spawned function that is returning.
3731  * @param returning_sf The __cilkrts_stack_frame for this returning function.
3732  * @return             The worker returning from this method.
3733  */
3734 static __cilkrts_worker*
3735 execute_reductions_for_spawn_return(__cilkrts_worker *w,
3736                                     full_frame *ff,
3737                                     __cilkrts_stack_frame *returning_sf)
3738 {
3739     // Step A1 from reducer protocol described above.
3740     //
3741     // Coerce the runtime into thinking that
3742     // ff/returning_sf are still on the bottom of
3743     // w's deque.
3744     restore_frame_for_spawn_return_reduction(w, ff, returning_sf);
3745
3746     // Step A2 and A3: Execute reductions on user stack.
3747     BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
3748         struct cilkred_map **left_map_ptr;
3749         left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);
3750
3751         // Pointer will be non-NULL if there are
3752         // still reductions to execute.
3753         if (left_map_ptr) {
3754             // WARNING: This method call may release the lock
3755             // on ff->parent and re-acquire it (possibly on a
3756             // different worker).
3757             // We can't hold locks while actually executing
3758             // reduce functions.
3759             w = slow_path_reductions_for_spawn_return(w,
3760                                                       ff,
3761                                                       left_map_ptr);
3762             verify_current_wkr(w);
3763         }
3764
3765         finish_spawn_return_on_user_stack(w, ff->parent, ff);
3766         // WARNING: the use of this lock macro is deceptive.
3767         // The worker may have changed here.
3768     } END_WITH_FRAME_LOCK(w, ff->parent);
3769     return w;
3770 }
3771
3772
3773
3774 /**
3775  * Execute fast "reductions" when ff stalls at a sync.
3776  *
3777  * @param   w  The currently executing worker.
3778  * @param   ff The full frame stalling at a sync.
3779  * @return  1 if we are finished with all reductions after calling this method.
3780  * @return  0 if we still need to execute the slow path reductions.
3781  */
3782 static inline
3783 int fast_path_reductions_for_sync(__cilkrts_worker *w,
3784                                   full_frame *ff) {
3785     // Return 0 if there is some reduction that needs to happen.
3786     return !(w->reducer_map  || ff->pending_exception);
3787 }
3788
3789 /**
3790  * Executes slow reductions when ff stalls at a sync.
3791  * This method should execute only if
3792  *   fast_path_reductions_for_sync(w, ff) returned 0.
3793  *
3794  * After this method completes:
3795  *   1. ff's current reducer map has been deposited into
3796  *       right_reducer_map of ff's rightmost child, or
3797  *       ff->children_reducer_map if ff has no children.
3798  *   2. Similarly for ff's current exception.
3799  *   3. Nothing to calculate for stacks --- if we are stalling
3800  *      we will always free a stack.
3801  *
3802  * This method may repeatedly acquire/release the lock on ff.
3803  *
3804  * @param   w  The currently executing worker.
3805  * @param   ff The full frame stalling at a sync.
3806  * @return  The worker returning from this method.
3807  */
3808 static __cilkrts_worker*
3809 slow_path_reductions_for_sync(__cilkrts_worker *w,
3810                               full_frame *ff)
3811 {
3812     struct cilkred_map *left_map;
3813     struct cilkred_map *middle_map;
3814
3815 #if (REDPAR_DEBUG > 0)
3816     CILK_ASSERT(ff);
3817     CILK_ASSERT(w->head == w->tail);
3818 #endif
3819
3820     middle_map = w->reducer_map;
3821     w->reducer_map = NULL;
3822
3823     // Loop invariant: middle_map should be valid (the current map to reduce).
3824     //                 left_map is junk.
3825     //                 w->reducer_map == NULL.
3826     while (1) {
3827         BEGIN_WITH_FRAME_LOCK(w, ff) {
3828             splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);
3829
3830             // Grab the "left" map and store pointers to those locations.
3831             left_map = *(left_ptrs.map_ptr);
3832             *(left_ptrs.map_ptr) = NULL;
3833
3834             // Slide the maps in our struct left as far as possible.
3835             if (!left_map) {
3836                 left_map = middle_map;
3837                 middle_map = NULL;
3838             }
3839
3840             *(left_ptrs.exception_ptr) =
3841                 __cilkrts_merge_pending_exceptions(w,
3842                                                    *left_ptrs.exception_ptr,
3843                                                    ff->pending_exception);
3844             ff->pending_exception = NULL;
3845
3846             // If there is no middle map, then we are done.
3847             // Deposit left and return.
3848             if (!middle_map) {
3849                 *(left_ptrs).map_ptr = left_map;
3850                 #if (REDPAR_DEBUG > 0)
3851                 CILK_ASSERT(NULL == w->reducer_map);
3852                 #endif
3853                 // Sanity check upon leaving the loop.
3854                 verify_current_wkr(w);
3855                 // Make sure to unlock before we return!
3856                 __cilkrts_frame_unlock(w, ff);
3857                 return w;
3858             }
3859         } END_WITH_FRAME_LOCK(w, ff);
3860
3861         // After we've released the lock, start counting time as
3862         // WORKING again.
3863         STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
3864         START_INTERVAL(w, INTERVAL_WORKING);
3865
3866         // If we get here, we have a nontrivial reduction to execute.
3867         middle_map = repeated_merge_reducer_maps(&w,
3868                                                  left_map,
3869                                                  middle_map);
3870         verify_current_wkr(w);
3871
3872         STOP_INTERVAL(w, INTERVAL_WORKING);
3873         START_INTERVAL(w, INTERVAL_IN_RUNTIME);
3874
3875         // Save any exceptions generated because of the reduction
3876         // process.  These get merged the next time around the
3877         // loop.
3878         CILK_ASSERT(NULL == ff->pending_exception);
3879         ff->pending_exception = w->l->pending_exception;
3880         w->l->pending_exception = NULL;
3881     }
3882
3883     // We should never break out of the loop above.
3884     CILK_ASSERT(0);
3885     return NULL;
3886 }
3887
3888
3889 /**
3890  * Execute reductions when ff stalls at a sync.
3891  *
3892  * Execution starts on w, but may finish on a different worker.
3893  * This method may acquire/release the lock on ff.
3894  *
3895  * @param w          The currently executing worker.
3896  * @param ff         The full frame of the spawned function at the sync
3897  * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync
3898  * @return           The worker returning from this method.
3899  */
3900 static __cilkrts_worker*
3901 execute_reductions_for_sync(__cilkrts_worker *w,
3902                             full_frame *ff,
3903                             __cilkrts_stack_frame *sf_at_sync)
3904 {
3905     int finished_reductions;
3906     // Step B1 from reducer protocol above:
3907     // Restore runtime invariants.
3908     //
3909     // The following code for this step is almost equivalent to
3910     // the following sequence:
3911     //   1. disown(w, ff, sf_at_sync, "sync") (which itself
3912     //        calls make_unrunnable(w, ff, sf_at_sync))
3913     //   2. make_runnable(w, ff, sf_at_sync).
3914     //
3915     // The "disown" will mark the frame "sf_at_sync"
3916     // as stolen and suspended, and save its place on the stack,
3917     // so it can be resumed after the sync.
3918     //
3919     // The difference is, that we don't want the disown to
3920     // break the following connections yet, since we are
3921     // about to immediately make sf/ff runnable again anyway.
3922     //   sf_at_sync->worker == w
3923     //   w->l->frame_ff == ff.
3924     //
3925     // These connections are needed for parallel reductions, since
3926     // we will use sf / ff as the stack frame / full frame for
3927     // executing any potential reductions.
3928     //
3929     // TBD: Can we refactor the disown / make_unrunnable code
3930     // to avoid the code duplication here?
3931
3932     ff->call_stack = NULL;
3933
3934     // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
3935     // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
3936     // the stack so that a worker can resume the frame in the correct
3937     // place.
3938     //
3939     // But on this path, CILK_FRAME_STOLEN should already be set.
3940     // Also, we technically don't want to suspend the frame until
3941     // the reduction finishes.
3942     // We do, however, need to save the stack before
3943     // we start any reductions, since the reductions might push more
3944     // data onto the stack.
3945     CILK_ASSERT(sf_at_sync->flags | CILK_FRAME_STOLEN);
3946
3947     __cilkrts_put_stack(ff, sf_at_sync);
3948     __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
3949                                      "execute_reductions_for_sync");
3950     CILK_ASSERT(w->l->frame_ff == ff);
3951
3952     // Step B2: Execute reductions on user stack.
3953     // Check if we have any "real" reductions to do.
3954     finished_reductions = fast_path_reductions_for_sync(w, ff);
3955
3956     if (!finished_reductions) {
3957         // Still have some real reductions to execute.
3958         // Run them here.
3959
3960         // This method may acquire/release the lock on ff.
3961         w = slow_path_reductions_for_sync(w, ff);
3962
3963         // The previous call may return on a different worker.
3964         // than what we started on.
3965         verify_current_wkr(w);
3966     }
3967
3968 #if REDPAR_DEBUG >= 0
3969     CILK_ASSERT(w->l->frame_ff == ff);
3970     CILK_ASSERT(ff->call_stack == NULL);
3971 #endif
3972
3973     // Now we suspend the frame ff (since we've
3974     // finished the reductions).  Roughly, we've split apart the
3975     // "make_unrunnable" call here --- we've already saved the
3976     // stack info earlier before the reductions execute.
3977     // All that remains is to restore the call stack back into the
3978     // full frame, and mark the frame as suspended.
3979     ff->call_stack = sf_at_sync;
3980     sf_at_sync->flags |= CILK_FRAME_SUSPENDED;
3981
3982     // At a nontrivial sync, we should always free the current fiber,
3983     // because it can not be leftmost.
3984     w->l->fiber_to_free = ff->fiber_self;
3985     ff->fiber_self = NULL;
3986     return w;
3987 }
3988
3989
3990 /*
3991   Local Variables: **
3992   c-file-style:"bsd" **
3993   c-basic-offset:4 **
3994   indent-tabs-mode:nil **
3995   End: **
3996 */