libcilkrts/runtime/scheduler.c

   1 /* scheduler.c                  -*-C-*-
   2  *
   3  *************************************************************************
   4  *
   5  *  Copyright (C) 2007-2016, Intel Corporation
   6  *  All rights reserved.
   7  *
   8  *  Redistribution and use in source and binary forms, with or without
   9  *  modification, are permitted provided that the following conditions
  10  *  are met:
  11  *
  12  *    * Redistributions of source code must retain the above copyright
  13  *      notice, this list of conditions and the following disclaimer.
  14  *    * Redistributions in binary form must reproduce the above copyright
  15  *      notice, this list of conditions and the following disclaimer in
  16  *      the documentation and/or other materials provided with the
  17  *      distribution.
  18  *    * Neither the name of Intel Corporation nor the names of its
  19  *      contributors may be used to endorse or promote products derived
  20  *      from this software without specific prior written permission.
  21  *
  22  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  26  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  29  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  30  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  32  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33  *  POSSIBILITY OF SUCH DAMAGE.
  34  *
  35  *  *********************************************************************
  36  *
  37  *  PLEASE NOTE: This file is a downstream copy of a file maintained in
  38  *  a repository at cilkplus.org. Changes made to this file that are not
  39  *  submitted through the contribution process detailed at
  40  *  http://www.cilkplus.org/submit-cilk-contribution will be lost the next
  41  *  time that a new version is released. Changes only submitted to the
  42  *  GNU compiler collection or posted to the git repository at
  43  *  https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
  44  *  not tracked.
  45  *
  46  *  We welcome your contributions to this open source project. Thank you
  47  *  for your assistance in helping us improve Cilk Plus.
  48  *
  49  **************************************************************************/
  50
  51 /*
  52  * Cilk scheduler
  53  */
  54
  55 #include "scheduler.h"
  56 #include "bug.h"
  57 #include "os.h"
  58 #include "os_mutex.h"
  59 #include "local_state.h"
  60 #include "signal_node.h"
  61 #include "full_frame.h"
  62 #include "sysdep.h"
  63 #include "except.h"
  64 #include "cilk_malloc.h"
  65 #include "pedigrees.h"
  66 #include "record-replay.h"
  67
  68 #include <limits.h>
  69 #include <string.h> /* memcpy */
  70 #include <stdio.h>  // sprintf
  71 #include <stdlib.h> // malloc, free, abort
  72
  73 #ifdef _WIN32
  74 #   pragma warning(disable:1786)   // disable warning: sprintf is deprecated
  75 #   include "sysdep-win.h"
  76 #   include "except-win32.h"
  77 #endif  // _WIN32
  78
  79 // ICL: Don't complain about conversion from pointer to same-sized integral
  80 // type in __cilkrts_put_stack.  That's why we're using ptrdiff_t
  81 #ifdef _WIN32
  82 #   pragma warning(disable: 1684)
  83 #endif
  84
  85 #include "cilk/cilk_api.h"
  86 #include "frame_malloc.h"
  87 #include "metacall_impl.h"
  88 #include "reducer_impl.h"
  89 #include "cilk-tbb-interop.h"
  90 #include "cilk-ittnotify.h"
  91 #include "stats.h"
  92
  93 // ICL: Don't complain about loss of precision in myrand
  94 // I tried restoring the warning after the function, but it didn't
  95 // suppress it
  96 #ifdef _WIN32
  97 #   pragma warning(disable: 2259)
  98 #endif
  99
 100 #ifndef _WIN32
 101 #   include <unistd.h>
 102 #endif
 103
 104 #ifdef __VXWORKS__
 105 // redeclare longjmp() with noreturn to stop warnings
 106 extern __attribute__((noreturn))
 107                 void longjmp(jmp_buf, int);
 108 #endif
 109
 110 //#define DEBUG_LOCKS 1
 111 #ifdef DEBUG_LOCKS
 112 // The currently executing worker must own this worker's lock
 113 #   define ASSERT_WORKER_LOCK_OWNED(w) \
 114         { \
 115             __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \
 116             CILK_ASSERT((w)->l->lock.owner == tls_worker); \
 117         }
 118 #else
 119 #   define ASSERT_WORKER_LOCK_OWNED(w)
 120 #endif // DEBUG_LOCKS
 121
 122 // Options for the scheduler; this is the return type of worker_runnable().
 123 enum schedule_t { SCHEDULE_RUN,
 124                   SCHEDULE_WAIT,
 125                   SCHEDULE_EXIT };
 126
 127 // Return values for provably_good_steal(): what the worker does at the sync
 128 enum provably_good_steal_t
 129 {
 130     ABANDON_EXECUTION,  // Not the last child to the sync - attempt to steal work
 131     CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker
 132     WAIT_FOR_CONTINUE   // The replay log indicates that this was the worker
 133                         // which continued.  Loop until we are the last worker
 134                         // to reach the sync.
 135 };
 136
 137
 138 // Verify that "w" is the worker we are currently executing on.  The check
 139 // needs a TLS lookup, so it is compiled in only for the debug settings below.
 140 static inline void verify_current_wkr(__cilkrts_worker *w)
 141 {
 142 #if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1))
 143     // Look up the worker from TLS and compare it to w.
 144     __cilkrts_worker* tmp = __cilkrts_get_tls_worker();
 145     if (w != tmp) {
 146         // Either pointer may be NULL on a mismatch; print -1 instead of crashing.
 147         fprintf(stderr, "Error.  W=%d, actual worker =%d...\n",
 148                 w ? w->self : -1, tmp ? tmp->self : -1);
 149     }
 150     CILK_ASSERT(w == tmp);
 151 #endif
 152 }
 153
 154 static enum schedule_t worker_runnable(__cilkrts_worker *w);
 155
 156 // Scheduling-fiber functions:
 157 static void do_return_from_spawn (__cilkrts_worker *w,
 158                                   full_frame *ff,
 159                                   __cilkrts_stack_frame *sf);
 160 static void do_sync (__cilkrts_worker *w,
 161                      full_frame *ff,
 162                      __cilkrts_stack_frame *sf);
 163
 164 // max is already defined on Windows and VxWorks; supply it everywhere else.
 165 #if (! defined(_WIN32)) && (! defined(__VXWORKS__))
 166     // TBD: proper max() for Linux.  NOTE: the selected argument is evaluated twice.
 167 #   define max(a, b) ((a) < (b) ? (b) : (a))
 168 #endif
 169
 170 /* Print thread counts, runtime memory usage and (if profiled) stats of G. */
 171 void __cilkrts_dump_stats_to_stderr(global_state_t *g)
 172 {
 173 #ifdef CILK_PROFILE
 174     int wkr;
 175
 176     /* Report each worker's statistics and fold them into the global
 177        totals as we go. */
 178     for (wkr = 0; wkr < g->total_workers; ++wkr) {
 179         fprintf(stderr, "Stats for worker %d\n", wkr);
 180         dump_stats_to_file(stderr, g->workers[wkr]->l->stats);
 181         __cilkrts_accum_stats(&g->stats, g->workers[wkr]->l->stats);
 182     }
 183
 184     /* The accumulated totals are printed last. */
 185     dump_stats_to_file(stderr, &g->stats);
 186 #endif
 187     fprintf(stderr, "CILK PLUS Thread Info: P=%d, Q=%d\n", g->P, g->Q);
 188     fprintf(stderr, "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes",
 189             (long long)g->frame_malloc.allocated_from_os);
 190 #ifdef CILK_PROFILE
 191     /* The stack high-water mark is appended only when it is non-zero. */
 192     if (g->stats.stack_hwm != 0) {
 193         fprintf(stderr, ", %ld stacks", g->stats.stack_hwm);
 194     }
 195 #endif
 196     fputc('\n', stderr);
 197 }
 198
 199 /* Debug aid: bail out via abort_because_rts_is_corrupted() on bad magic. */
 200 static void validate_worker(__cilkrts_worker *w)
 201 {
 202     int intact = (w->l->worker_magic_0 == WORKER_MAGIC_0 &&
 203                   w->l->worker_magic_1 == WORKER_MAGIC_1);
 204     if (!intact) abort_because_rts_is_corrupted();
 205 }
 206
 207 /* Join two sibling frames.  A NULL side is simply left untouched. */
 208 static void double_link(full_frame *left_ff, full_frame *right_ff)
 209 {
 210     /* each non-NULL frame is pointed at its new neighbour */
 211     if (NULL != left_ff)  { left_ff->right_sibling = right_ff; }
 212     if (NULL != right_ff) { right_ff->left_sibling = left_ff; }
 213 }
 214
 215 /* Append CHILD_FF to PARENT_FF's child list as the new rightmost child. */
 216 static void push_child(full_frame *parent_ff, full_frame *child_ff)
 217 {
 218     double_link(parent_ff->rightmost_child, child_ff); /* after old last child */
 219     double_link(child_ff, NULL);           /* nothing lies to its right */
 220     parent_ff->rightmost_child = child_ff;
 221 }
 222
 223 /* Remove CHILD_FF from the sibling list hanging off PARENT_FF. */
 224 static void unlink_child(full_frame *parent_ff, full_frame *child_ff)
 225 {
 226     full_frame *left = child_ff->left_sibling;
 227     full_frame *right = child_ff->right_sibling;
 228     double_link(left, right);   /* neighbours now point at each other */
 229     if (NULL == right) {        /* child was rightmost: fix the parent link */
 230         CILK_ASSERT(parent_ff->rightmost_child == child_ff);
 231         parent_ff->rightmost_child = left;
 232     }
 233     child_ff->left_sibling = child_ff->right_sibling = NULL; /* paranoia */
 234 }
 235
 236 static void incjoin(full_frame *ff)
 237 {
 238     ff->join_counter += 1;  /* FF gains one more reference */
 239 }
 240
 241 static int decjoin(full_frame *ff)
 242 {
 243     CILK_ASSERT(0 < ff->join_counter);
 244     return (ff->join_counter -= 1);     /* value after the decrement */
 245 }
 246
 247 static int simulate_decjoin(full_frame *ff)
 248 {
 249     CILK_ASSERT(0 < ff->join_counter);
 250     return ff->join_counter - 1;    /* what decjoin() would yield; FF unchanged */
 251 }
 252
 253 /*
 254  * Pseudo-random generator defined by the congruence S' = 69070 * S
 255  * mod (2^32 - 5).  Marsaglia (CACM July 1993) says on page 107 that
 256  * this is a ``good one''.
 257  *
 258  * The literature makes a big fuss about avoiding the division, but
 259  * for us it is not worth the hassle.
 260  */
 261 static const unsigned RNGMOD = ((1ULL << 32) - 5); /* modulus; assumes 32-bit unsigned - TODO confirm */
 262 static const unsigned RNGMUL = 69070U;             /* multiplier */
 263
 264 static unsigned myrand(__cilkrts_worker *w)
 265 {
 266     const unsigned prev = w->l->rand_seed;      /* current generator state */
 267     const unsigned next = (unsigned)(((unsigned long long)prev * RNGMUL) % RNGMOD);
 268     w->l->rand_seed = next;                     /* becomes the new state */
 269     return next;
 270 }
 271
 272 static void mysrand(__cilkrts_worker *w, unsigned seed)
 273 {
 274     unsigned reduced = seed % RNGMOD;
 275     /* 0 does not belong to the multiplicative group (myrand() would
 276        keep returning 0), so use 1 instead */
 277     w->l->rand_seed = (reduced == 0) ? 1U : reduced;
 278 }
 279
 280 /* W grabs its own lock: raise do_not_steal first, then take w->l->lock */
 281 void __cilkrts_worker_lock(__cilkrts_worker *w)
 282 {
 283     validate_worker(w);
 284     CILK_ASSERT(w->l->do_not_steal == 0);
 285
 286     /* tell thieves to stay out of the way (worker_trylock_other checks this) */
 287     w->l->do_not_steal = 1;
 288     __cilkrts_fence(); /* probably redundant */
 289
 290     __cilkrts_mutex_lock(w, &w->l->lock);
 291 }
 292
 293 void __cilkrts_worker_unlock(__cilkrts_worker *w)
 294 {
 295     __cilkrts_mutex_unlock(w, &w->l->lock); /* W releases its own lock first */
 296     CILK_ASSERT(w->l->do_not_steal == 1);
 297     /* Now clear do_not_steal.  The fence is probably redundant.  Use a
 298        release operation when supported (gcc and compatible);
 299        that is faster on x86 which serializes normal stores. */
 300 #if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110)
 301     __sync_lock_release(&w->l->do_not_steal);
 302 #else
 303     w->l->do_not_steal = 0;
 304     __cilkrts_fence(); /* store-store barrier, redundant on x86 */
 305 #endif
 306 }
 307
 308 /* W tries, without blocking, to take OTHER's lock; returns 0 if it backed off */
 309 static int worker_trylock_other(__cilkrts_worker *w,
 310                                 __cilkrts_worker *other)
 311 {
 312     int status = 0;
 313
 314     validate_worker(other);
 315
 316     /* This protocol guarantees that, after setting the DO_NOT_STEAL
 317        flag, worker W can enter its critical section after waiting for
 318        the thief currently in the critical section (if any) and at
 319        most one other thief.
 320
 321        This requirement is overly paranoid, but it should protect us
 322        against future nonsense from OS implementors.
 323     */
 324
 325     /* compete for the right to disturb OTHER */
 326     if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) {
 327         if (other->l->do_not_steal) {
 328             /* leave it alone: OTHER has announced it wants its own lock */
 329         } else {
 330             status = __cilkrts_mutex_trylock(w, &other->l->lock);
 331         }
 332         __cilkrts_mutex_unlock(w, &other->l->steal_lock); /* held only while deciding */
 333     }
 334
 335
 336     return status;
 337 }
 338
 339 static void worker_unlock_other(__cilkrts_worker *w,
 340                                 __cilkrts_worker *other)
 341 {
 342     __cilkrts_mutex_unlock(w, &other->l->lock); /* the lock worker_trylock_other() took */
 343 }
 344
 345
 346 /* Lock macro usage (BEGIN_ and END_ must always be paired):
 347     BEGIN_WITH_WORKER_LOCK(w) {
 348         statement;
 349         statement;
 350         BEGIN_WITH_FRAME_LOCK(w, ff) {
 351             statement;
 352             statement;
 353         } END_WITH_FRAME_LOCK(w, ff);
 354     } END_WITH_WORKER_LOCK(w);
 355    The unlock sits in the closing while(), so never break/return out of the body. */
 356 #define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do
 357 #define END_WITH_WORKER_LOCK(w)   while (__cilkrts_worker_unlock(w), 0)
 358
 359 // TBD(jsukha): These are worker lock acquisitions on
 360 // a worker whose deque is empty.  My conjecture is that we
 361 // do not need to hold the worker lock at these points.
 362 // I have left them in for now, however.
 363 //
 364 // #define REMOVE_POSSIBLY_OPTIONAL_LOCKS
 365 #ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS
 366     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do
 367     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (0)
 368 #else
 369     #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do
 370     #define END_WITH_WORKER_LOCK_OPTIONAL(w)   while (__cilkrts_worker_unlock(w), 0)
 371 #endif
 372
 373 // Frame-lock variant: FF is evaluated once and cached in _locked_ff for the unlock.
 374 #define BEGIN_WITH_FRAME_LOCK(w, ff)                                     \
 375     do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do
 376
 377 #define END_WITH_FRAME_LOCK(w, ff)                       \
 378     while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0)
 379
 380 /* W becomes the owner of FF and FF can be stolen from W */
 381 static void make_runnable(__cilkrts_worker *w, full_frame *ff)
 382 {
 383     w->l->frame_ff = ff;    /* FF is now W's current full frame */
 384
 385     /* CALL_STACK is invalid (the information is stored implicitly in W) */
 386     ff->call_stack = 0;
 387 }
 388
 389 /* Record SF (may be NULL) as FF's call stack.  A non-NULL SF is flagged
 390  * STOLEN|SUSPENDED and loses its worker; IS_LOOT also calls __cilkrts_put_stack().
 391  * W is used only for the debug message and, like WHY, passed to the sysdep hook. */
 392 static void make_unrunnable(__cilkrts_worker *w,
 393                             full_frame *ff,
 394                             __cilkrts_stack_frame *sf,
 395                             int is_loot,
 396                             const char *why)
 397 {
 398     /* CALL_STACK becomes valid again */
 399     ff->call_stack = sf;
 400
 401     if (sf) {
 402 #if CILK_LIB_DEBUG
 403         if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0))
 404             __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf);
 405 #endif
 406         sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED;
 407         sf->worker = 0;     /* the frame is no longer bound to any worker */
 408
 409         if (is_loot)
 410             __cilkrts_put_stack(ff, sf);
 411
 412         /* perform any system-dependent action, such as saving the
 413            state of the stack */
 414         __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why);
 415     }
 416 }
 417
 418
 419 /* Push the next full frame to be made active in this worker and increment its
 420  * join counter.  __cilkrts_push_next_frame and pop_next_frame work on a
 421  * one-element queue.  This queue is used to communicate across the runtime
 422  * from the code that wants to activate a frame to the code that can actually
 423  * begin execution on that frame.  They are asymmetrical in that push
 424  * increments the join counter but pop does not decrement it.  Rather, a
 425  * single push/pop combination makes a frame active and increments its join
 426  * counter once. */
 427 void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff)
 428 {
 429     CILK_ASSERT(ff);
 430     CILK_ASSERT(!w->l->next_frame_ff);  /* the one-element queue must be empty */
 431     incjoin(ff);
 432     w->l->next_frame_ff = ff;
 433 }
 434
 435 /* Get the next full-frame to be made active in this worker.  The join count
 436  * of the full frame will have been incremented by the corresponding push
 437  * event.  See __cilkrts_push_next_frame, above.
 438  * Returns NULL, leaving the slot untouched, when no frame is queued. */
 439 static full_frame *pop_next_frame(__cilkrts_worker *w)
 440 {
 441     full_frame *ff;
 442     ff = w->l->next_frame_ff;
 443     // Remove the frame from the next_frame field.
 444     //
 445     // If this is a user worker, then there is a chance that another worker
 446     // from our team could push work into our next_frame (if it is the last
 447     // worker doing work for this team).  The other worker's setting of the
 448     // next_frame could race with our setting of next_frame to NULL.  This is
 449     // the only possible race condition on next_frame.  However, if next_frame
 450     // has a non-NULL value, then it means the team still has work to do, and
 451     // there is no chance of another team member populating next_frame.  Thus,
 452     // it is safe to set next_frame to NULL, if it was populated.  There is no
 453     // need for an atomic op.
 454     if (NULL != ff) {
 455         w->l->next_frame_ff = NULL;
 456     }
 457     return ff;
 458 }
 459
 460 /*
 461  * Identify the single worker that is allowed to cross a sync in this frame.  A
 462  * thief should call this function when it is the first to steal work from a
 463  * user worker.  "First to steal work" may mean that there has been parallelism
 464  * in the user worker before, but the whole team sync'd, and this is the first
 465  * steal after that.
 466  *
 467  * This should happen while holding the worker and frame lock.
 468  */
 469 static void set_sync_master(__cilkrts_worker *w, full_frame *ff)
 470 {
 471     w->l->last_full_frame = ff; /* W remembers the frame it must sync ... */
 472     ff->sync_master = w;        /* ... and FF remembers W as its sync master */
 473 }
 474
 475 /*
 476  * The sync that ends all parallelism for a particular user worker is about to
 477  * be crossed.  Decouple the worker and frame.
 478  *
 479  * No locks need to be held since the user worker isn't doing anything, and none
 480  * of the system workers can steal from it.  But unset_sync_master() should be
 481  * called before the user worker knows about this work (i.e., before
 482  * w->l->next_frame_ff is set).
 483  */
 484 static void unset_sync_master(__cilkrts_worker *w, full_frame *ff)
 485 {
 486     CILK_ASSERT(WORKER_USER == w->l->type);
 487     CILK_ASSERT(ff->sync_master == w);
 488     ff->sync_master = NULL;
 489     w->l->last_full_frame = NULL;
 490 }
 491
 492 /********************************************************************
 493  * THE protocol:
 494  ********************************************************************/
 495 /*
 496  * This is a protocol for work stealing that minimizes the overhead on
 497  * the victim.
 498  *
 499  * The protocol uses three shared pointers into the worker's deque:
 500  * - T - the "tail"
 501  * - H - the "head"
 502  * - E - the "exception"  NB: In this case, "exception" has nothing to do
 503  * with C++ throw-catch exceptions -- it refers only to a non-normal return,
 504  * i.e., a steal or similar scheduling exception.
 505  *
 506  * with H <= E, H <= T.
 507  *
 508  * Stack frames SF, where H <= E < T, are available for stealing.
 509  *
 510  * The worker operates on the T end of the stack.  The frame being
 511  * worked on is not on the stack.  To make a continuation available for
 512  * stealing the worker pushes a frame onto the stack: stores *T++ = SF.
 513  * To return, it pops the frame off the stack: obtains SF = *--T.
 514  *
 515  * After decrementing T, the condition E > T signals to the victim that
 516  * it should invoke the runtime system's "THE" exception handler.  The
 517  * pointer E can become INFINITY, in which case the victim must invoke
 518  * the THE exception handler as soon as possible.
 519  *
 520  * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998,
 521  * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information
 522  * on the THE protocol.
 523  */
 524
 525 /* the infinity value of E: (-1) converted to the deque-slot pointer type */
 526 #define EXC_INFINITY  ((__cilkrts_stack_frame **) (-1))
 527
 528 /* Advance VICTIM's exception pointer E by one slot unless it is infinite. */
 529 static void increment_E(__cilkrts_worker *victim)
 530 {
 531     __cilkrts_stack_frame *volatile *cur_exc;
 532
 533     // victim->exc may only be touched while holding the victim's worker lock
 534     ASSERT_WORKER_LOCK_OWNED(victim);
 535
 536     cur_exc = victim->exc;
 537     if (cur_exc == EXC_INFINITY)
 538         return;         /* an infinite E stays infinite */
 539
 540     /* On most x86 this store + fence pair would be slightly faster as an
 541        atomic exchange, thanks to the barrier implicit in atomic ops. */
 542     victim->exc = cur_exc + 1;
 543     __cilkrts_fence();
 544 }
 545
 546 /* Move VICTIM's exception pointer E back one slot unless it is infinite. */
 547 static void decrement_E(__cilkrts_worker *victim)
 548 {
 549     __cilkrts_stack_frame *volatile *cur_exc;
 550
 551     // victim->exc may only be touched while holding the victim's worker lock
 552     ASSERT_WORKER_LOCK_OWNED(victim);
 553
 554     cur_exc = victim->exc;
 555     if (cur_exc == EXC_INFINITY)
 556         return;         /* an infinite E stays infinite */
 557
 558     /* On most x86 this store + fence pair would be slightly faster as an
 559        atomic exchange, thanks to the barrier implicit in atomic ops. */
 560     victim->exc = cur_exc - 1;
 561     __cilkrts_fence(); /* memory fence not really necessary */
 562 }
 563
 564 #if 0
 565 /* Compiled out: unused for now, will be necessary if we implement abort */
 566 static void signal_THE_exception(__cilkrts_worker *wparent)
 567 {
 568     wparent->exc = EXC_INFINITY;    /* E becomes infinite */
 569     __cilkrts_fence();
 570 }
 571 #endif
 572
 573 static void reset_THE_exception(__cilkrts_worker *w)
 574 {
 575     // The currently executing worker must own the worker lock to touch
 576     // w->exc
 577     ASSERT_WORKER_LOCK_OWNED(w);
 578
 579     w->exc = w->head;       /* E <- H, the lowest value the protocol allows */
 580     __cilkrts_fence();
 581 }
 582
 583 /* Nonzero iff victim->head lies below both tail and protected_tail. */
 584 static int can_steal_from(__cilkrts_worker *victim)
 585 {
 586     if (victim->head >= victim->tail) return 0;   /* nothing between H and T */
 587     return (victim->head < victim->protected_tail);
 588 }
 589
 590 /* Bump VICTIM's E to announce a steal attempt, then test whether the head
 591    frame is still available.  Returns 1 if it can be stolen; otherwise
 592    restores E and returns 0. */
 593 static int dekker_protocol(__cilkrts_worker *victim)
 594 {
 595     // increment_E and decrement_E write victim->exc, so the currently
 596     // executing worker must already own victim's lock.
 597     ASSERT_WORKER_LOCK_OWNED(victim);
 598
 599     /* ASSERT(E >= H); */
 600     increment_E(victim);
 601     /* ASSERT(E >= H + 1); */
 602
 603     if (!can_steal_from(victim)) {
 604         /* failure, restore previous state */
 605         decrement_E(victim);
 606         return 0;
 607     }
 608
 609     /* success, we can steal victim->head and set H <- H + 1
 610        in detach() */
 611     return 1;
 612 }
 613
 614
 615 /* Create a full frame for CHILD_SF, link it under PARENT_FF, and return it */
 616 static full_frame *make_child(__cilkrts_worker *w,
 617                               full_frame *parent_ff,
 618                               __cilkrts_stack_frame *child_sf,
 619                               cilk_fiber *fiber)
 620 {
 621     full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf);
 622
 623     child_ff->parent = parent_ff;
 624     push_child(parent_ff, child_ff);
 625
 626     //DBGPRINTF("%d-          make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n"
 627     //    "            parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n"
 628     //    "            child  - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n",
 629     //          w->self, child, parent, child_sf,
 630     //          parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child,
 631     //          child->parent, child->left_sibling, child->right_sibling, child->rightmost_child);
 632     CILK_ASSERT(parent_ff->call_stack);
 633     child_ff->is_call_child = (fiber == NULL);  /* a NULL fiber marks a called (not spawned) child */
 634
 635     /* PLACEHOLDER_FIBER is used as non-null marker indicating that
 636        child should be treated as a spawn child even though we have not
 637        yet assigned a real fiber to its parent. */
 638     if (fiber == PLACEHOLDER_FIBER)
 639         fiber = NULL; /* Parent actually gets a null fiber, for now */
 640
 641     /* perform any system-dependent actions, such as capturing
 642        parameter passing information */
 643     /*__cilkrts_make_child_sysdep(child, parent);*/
 644
 645     /* Child gets reducer map and stack of parent.
 646        Parent gets a new map and new stack. */
 647     child_ff->fiber_self = parent_ff->fiber_self;
 648     child_ff->sync_master = NULL;
 649
 650     if (child_ff->is_call_child) {
 651         /* Cause segfault on any attempted access.  The parent gets
 652            the child map and stack when the child completes. */
 653         parent_ff->fiber_self = 0;
 654     } else {
 655         parent_ff->fiber_self = fiber;
 656     }
 657
 658     incjoin(parent_ff);     /* the new child holds a reference on its parent */
 659     return child_ff;
 660 }
 661
 662 static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf)
 663 {
 664     __cilkrts_stack_frame *next_sf = sf->call_parent;  /* remember the link */
 665     sf->call_parent = NULL;                            /* then clear it in SF */
 666     return next_sf;
 667 }
 668
 669 /* w should be the currently executing worker.
 670  * loot_sf is the youngest stack frame in the call stack being
 671  *   unrolled (i.e., the most deeply nested stack frame.)
 672  *
 673  * When this method is called for a steal, loot_sf should be on a
 674  * victim worker which is different from w.
 675  * For CILK_FORCE_REDUCE, the victim worker will equal w.
 676  *
 677  * Before execution, the __cilkrts_stack_frame's are linked from
 678  * younger to older, i.e., each __cilkrts_stack_frame points to its parent.
 679  *
 680  * The oldest frame in the chain is attached to ff; a new child full frame
 681  * is made for each younger one, each full frame pointing to its parent.
 682  *
 683  * The method returns the full frame that ends up holding loot_sf, i.e., the
 684  * youngest full frame.
 685  */
 686 static full_frame *unroll_call_stack(__cilkrts_worker *w,
 687                                      full_frame *ff,
 688                                      __cilkrts_stack_frame *const loot_sf)
 689 {
 690     __cilkrts_stack_frame *sf = loot_sf;
 691     __cilkrts_stack_frame *rev_sf = 0;
 692     __cilkrts_stack_frame *t_sf;
 693
 694     CILK_ASSERT(sf);
 695     /*CILK_ASSERT(sf->call_parent != sf);*/
 696
 697     /* The leafmost frame is unsynched. */
 698     if (sf->worker != w)
 699         sf->flags |= CILK_FRAME_UNSYNCHED;
 700
 701     /* Reverse the call stack to make a linked list ordered from parent
 702        to child.  sf->call_parent points to the child of SF instead of
 703        the parent.  The walk stops at a DETACHED, STOLEN or LAST frame. */
 704     do {
 705         t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent;
 706         sf->call_parent = rev_sf;
 707         rev_sf = sf;
 708         sf = t_sf;
 709     } while (sf);
 710     sf = rev_sf;
 711
 712     /* Promote each stack frame to a full frame in order from parent
 713        to child, following the reversed list we just built. */
 714     make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1");
 715     /* T_SF is the *child* of SF, because we have reversed the list */
 716     for (t_sf = __cilkrts_advance_frame(sf); t_sf;
 717          sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) {
 718         ff = make_child(w, ff, t_sf, NULL);
 719         make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2");
 720     }
 721
 722     /* XXX What if the leafmost frame does not contain a sync
 723        and this steal is from promote own deque? */
 724     /*sf->flags |= CILK_FRAME_UNSYNCHED;*/
 725
 726     CILK_ASSERT(!sf->call_parent);
 727     return ff;
 728 }
 729
 730 /* detach the frame at the head of VICTIM's deque and install a new
 731    CHILD frame in its place; FIBER is passed on to make_child() */
 732 static void detach_for_steal(__cilkrts_worker *w,
 733                              __cilkrts_worker *victim,
 734                              cilk_fiber* fiber)
 735 {
 736     /* ASSERT: we own victim->lock */
 737
 738     full_frame *parent_ff, *child_ff, *loot_ff;
 739     __cilkrts_stack_frame *volatile *h;
 740     __cilkrts_stack_frame *sf;
 741
 742     w->l->team = victim->l->team;
 743
 744     CILK_ASSERT(w->l->frame_ff == 0 || w == victim);
 745
 746     h = victim->head;
 747
 748     CILK_ASSERT(*h);
 749
 750     victim->head = h + 1;   /* H <- H + 1: the head slot now belongs to the thief */
 751
 752     parent_ff = victim->l->frame_ff;
 753     BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
 754         /* parent no longer referenced by victim */
 755         decjoin(parent_ff);
 756
 757         /* obtain the victim call stack */
 758         sf = *h;
 759
 760         /* perform system-dependent normalizations */
 761         /*__cilkrts_normalize_call_stack_on_steal(sf);*/
 762
 763         /* unroll PARENT_FF with call stack SF, adopt the youngest
 764            frame LOOT.  If loot_ff == parent_ff, then we hold loot_ff->lock,
 765            otherwise, loot_ff is newly created and we can modify it without
 766            holding its lock. */
 767         loot_ff = unroll_call_stack(w, parent_ff, sf);
 768
 769         #if REDPAR_DEBUG >= 3
 770         fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n",
 771                 w->self, victim->self,
 772                 parent_ff, loot_ff);
 773         #endif
 774
 775         if (WORKER_USER == victim->l->type &&
 776             NULL == victim->l->last_full_frame) {
 777             // Mark this looted frame as special: only the original user worker
 778             // may cross the sync.
 779             //
 780             // This call is a shared access to
 781             // victim->l->last_full_frame.
 782             set_sync_master(victim, loot_ff);
 783         }
 784
 785         /* LOOT is the next frame that the thief W is supposed to
 786            run, unless the thief is stealing from itself, in which
 787            case the thief W == VICTIM executes CHILD and nobody
 788            executes LOOT. */
 789         if (w == victim) {
 790             /* Pretend that frame has been stolen */
 791             loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED;
 792             loot_ff->simulated_stolen = 1;
 793         }
 794         else
 795             __cilkrts_push_next_frame(w, loot_ff);
 796
 797         // After the "push_next_frame" call above (thief case), w owns loot_ff.
 798         child_ff = make_child(w, loot_ff, 0, fiber);
 799
 800         BEGIN_WITH_FRAME_LOCK(w, child_ff) {
 801             /* install child in the victim's work queue, taking
 802                the parent_ff's place */
 803             /* child is referenced by victim */
 804             incjoin(child_ff);
 805
 806             // With this call, w is bestowing ownership of the newly
 807             // created frame child_ff to the victim, and victim is
 808             // giving up ownership of parent_ff.
 809             //
 810             // Worker w will either take ownership of parent_ff
 811             // if parent_ff == loot_ff, or parent_ff will be
 812             // suspended.
 813             //
 814             // Note that this call changes the victim->frame_ff
 815             // while the victim may be executing.
 816             make_runnable(victim, child_ff);
 817         } END_WITH_FRAME_LOCK(w, child_ff);
 818     } END_WITH_FRAME_LOCK(w, parent_ff);
 819 }
 820
 821 /**
 822  * @brief cilk_fiber_proc that resumes user code after a successful
 823  * random steal.
 824
 825  * This function longjmps back into the user code whose state is
 826  * stored in cilk_fiber_get_data(fiber)->resume_sf.  The stack pointer
 827  * is adjusted so that the code resumes on the specified fiber stack
 828  * instead of its original stack.
 829  *
 830  * This method gets executed only on a fiber freshly allocated from a
 831  * pool.
 832  *
 833  * @param fiber   The fiber being used to resume user code.
 834  * @param arg     Unused.
 835  */
 836 static
 837 void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber)
 838 {
 839     cilk_fiber_data *data = cilk_fiber_get_data(fiber);
 840     __cilkrts_stack_frame* sf = data->resume_sf;
 841     full_frame *ff;
 842
 843     CILK_ASSERT(sf);
 844
 845     // When we pull the resume_sf out of the fiber to resume it, clear
 846     // the old value.
 847     data->resume_sf = NULL;
 848     CILK_ASSERT(sf->worker == data->owner);
 849     ff = sf->worker->l->frame_ff;
 850
 851     // For Win32, we need to overwrite the default exception handler
 852     // in this function, so that when the OS exception handling code
 853     // walks off the top of the current Cilk stack, it reaches our stub
 854     // handler.
 855
 856     // Also, this function needs to be wrapped into a try-catch block
 857     // so the compiler generates the appropriate exception information
 858     // in this frame.
 859
 860     // TBD: IS THIS HANDLER IN THE WRONG PLACE?  Can we longjmp out of
 861     // this function (and does it matter?)
 862 #if defined(_WIN32) && !defined(_WIN64)
 863     install_exception_stub_handler();
 864     __try
 865 #endif
 866     {
 867         char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf);
 868
 869         // Notify the Intel tools that we're stealing code
 870         ITT_SYNC_ACQUIRED(sf->worker);
 871         NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
 872
 873         // TBD: We'd like to move TBB-interop methods into the fiber
 874         // eventually.
 875         cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
 876
 877         sf->flags &= ~CILK_FRAME_SUSPENDED;
 878
 879         // longjmp to user code.  Don't process exceptions here,
 880         // because we are resuming a stolen frame.
 881         sysdep_longjmp_to_sf(new_sp, sf, NULL);
 882         /*NOTREACHED*/
 883         // Intel's C compiler respects the preceding lint pragma
 884     }
 885 #if defined(_WIN32) && !defined(_WIN64)
 886     __except (CILK_ASSERT(!"should not execute the the stub filter"),
 887               EXCEPTION_EXECUTE_HANDLER)
 888     {
 889         // If we are here, that means something very wrong
 890         // has happened in our exception processing...
 891         CILK_ASSERT(! "should not be here!");
 892     }
 893 #endif
 894 }
 895
 896 static void random_steal(__cilkrts_worker *w)
 897 {
 898     __cilkrts_worker *victim = NULL;
 899     cilk_fiber *fiber = NULL;
 900     int n;
 901     int success = 0;
 902     int32_t victim_id;
 903
 904     // Nothing's been stolen yet. When true, this will flag
 905     // setup_for_execution_pedigree to increment the pedigree
 906     w->l->work_stolen = 0;
 907
 908     /* If the user has disabled stealing (using the debugger) we fail */
 909     if (__builtin_expect(w->g->stealing_disabled, 0))
 910         return;
 911
 912     CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w);
 913
 914     /* If there is only one processor work can still be stolen.
 915        There must be only one worker to prevent stealing. */
 916     CILK_ASSERT(w->g->total_workers > 1);
 917
 918     /* pick random *other* victim */
 919     n = myrand(w) % (w->g->total_workers - 1);
 920     if (n >= w->self)
 921         ++n;
 922
 923     // If we're replaying a log, override the victim.  -1 indicates that
 924     // we've exhausted the list of things this worker stole when we recorded
 925     // the log so just return.  If we're not replaying a log,
 926     // replay_get_next_recorded_victim() just returns the victim ID passed in.
 927     n = replay_get_next_recorded_victim(w, n);
 928     if (-1 == n)
 929         return;
 930
 931     victim = w->g->workers[n];
 932
 933     START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
 934         /* Verify that we can get a stack.  If not, no need to continue. */
 935         fiber = cilk_fiber_allocate(&w->l->fiber_pool);
 936     } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);
 937
 938
 939     if (NULL == fiber) {
 940 #if FIBER_DEBUG >= 2
 941         fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n",
 942                 w->self);
 943 #endif
 944         return;
 945     }
 946
 947     /* do not steal from self */
 948     CILK_ASSERT (victim != w);
 949
 950     /* Execute a quick check before engaging in the THE protocol.
 951        Avoid grabbing locks if there is nothing to steal. */
 952     if (!can_steal_from(victim)) {
 953         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
 954         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
 955             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
 956             // Fibers we use when trying to steal should not be active,
 957             // and thus should not have any other references.
 958             CILK_ASSERT(0 == ref_count);
 959         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
 960         return;
 961     }
 962
 963     /* Attempt to steal work from the victim */
 964     if (worker_trylock_other(w, victim)) {
 965         if (w->l->type == WORKER_USER && victim->l->team != w) {
 966
 967             // Fail to steal if this is a user worker and the victim is not
 968             // on this team.  If a user worker were allowed to steal work
 969             // descended from another user worker, the former might not be
 970             // done with its work by the time it was needed to resume and
 971             // unbind.  Therefore, user workers are not permitted to change
 972             // teams.
 973
 974             // There is no race on the victim's team because the victim cannot
 975             // change its team until it runs out of work to do, at which point
 976             // it will try to take out its own lock, and this worker already
 977             // holds it.
 978             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER);
 979
 980         } else if (victim->l->frame_ff) {
 981             // A successful steal will change victim->frame_ff, even
 982             // though the victim may be executing.  Thus, the lock on
 983             // the victim's deque is also protecting victim->frame_ff.
 984             if (dekker_protocol(victim)) {
 985                 int proceed_with_steal = 1; // optimistic
 986
 987                 // If we're replaying a log, verify that this the correct frame
 988                 // to steal from the victim
 989                 if (! replay_match_victim_pedigree(w, victim))
 990                 {
 991                     // Abort the steal attempt. decrement_E(victim) to
 992                     // counter the increment_E(victim) done by the
 993                     // dekker protocol
 994                     decrement_E(victim);
 995                     proceed_with_steal = 0;
 996                 }
 997
 998                 if (proceed_with_steal)
 999                 {
1000                     START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) {
1001                         success = 1;
1002                         detach_for_steal(w, victim, fiber);
1003                         victim_id = victim->self;
1004
1005                         #if REDPAR_DEBUG >= 1
1006                         fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n",
1007                                 w->self, victim->self, fiber);
1008                         #endif
1009
1010                         // The use of victim->self contradicts our
1011                         // classification of the "self" field as
1012                         // local.  But since this code is only for
1013                         // debugging, it is ok.
1014                         DBGPRINTF ("%d-%p: Stealing work from worker %d\n"
1015                             "            sf: %p, call parent: %p\n",
1016                             w->self, GetCurrentFiber(), victim->self,
1017                             w->l->next_frame_ff->call_stack,
1018                             w->l->next_frame_ff->call_stack->call_parent);
1019                     } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS);
1020                 }  // end if(proceed_with_steal)
1021             } else {
1022                 NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER);
1023             }
1024         } else {
1025             NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ);
1026         }
1027         worker_unlock_other(w, victim);
1028     } else {
1029         NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK);
1030     }
1031
1032     // Record whether work was stolen.  When true, this will flag
1033     // setup_for_execution_pedigree to increment the pedigree
1034     w->l->work_stolen = success;
1035
1036     if (0 == success) {
1037         // failed to steal work.  Return the fiber to the pool.
1038         START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
1039             int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool);
1040             // Fibers we use when trying to steal should not be active,
1041             // and thus should not have any other references.
1042             CILK_ASSERT(0 == ref_count);
1043         } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
1044     }
1045     else
1046     {
1047         // Since our steal was successful, finish initialization of
1048         // the fiber.
1049         cilk_fiber_reset_state(fiber,
1050                                fiber_proc_to_resume_user_code_for_random_steal);
1051         // Record the pedigree of the frame that w has stolen.
1052         // record only if CILK_RECORD_LOG is set
1053         replay_record_steal(w, victim_id);
1054     }
1055 }
1056
1057
1058
1059 /**
1060  * At a provably good steal, we need to transfer the child reducer map
1061  * from ff->children_reducer_map into v->reducer_map, where v is the
1062  * worker that resumes execution of ff.
1063  *
1064  * Normally, we have v == w, where w is the currently executing
1065  * worker.  In the case where we are resuming a team leader on a user
1066  * worker, however, v might differ from w.
1067
1068  * Thus, this, operation is a no-op, since we can't really move
1069  * ff->children_reducer_map into w here.
1070  *
1071  * Instead, this work is done in setup_for_execution_reducers().
1072  */
1073 static inline void provably_good_steal_reducers(__cilkrts_worker *w,
1074                                                 full_frame       *ff)
1075 {
1076     // No-op.
1077 }
1078
1079 /* at a provably good steal, incorporate the accumulated exceptions of
1080    children into the parent's exception */
1081 static void provably_good_steal_exceptions(__cilkrts_worker *w,
1082                                            full_frame       *ff)
1083 {
1084     // ASSERT: we own ff->lock
1085     ff->pending_exception =
1086         __cilkrts_merge_pending_exceptions(w,
1087                                            ff->child_pending_exception,
1088                                            ff->pending_exception);
1089     ff->child_pending_exception = NULL;
1090 }
1091
1092 /* At sync discard the frame's old stack and take the leftmost child's. */
1093 static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff)
1094 {
1095     CILK_ASSERT(NULL == ff->fiber_self);
1096     ff->fiber_self = ff->fiber_child;
1097     ff->fiber_child = NULL;
1098 }
1099
1100 static void __cilkrts_mark_synched(full_frame *ff)
1101 {
1102     ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED;
1103     ff->simulated_stolen = 0;
1104 }
1105
1106 static
1107 enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w,
1108                                                full_frame       *ff)
1109 {
1110     // ASSERT: we hold w->lock and ff->lock
1111
1112     enum provably_good_steal_t result = ABANDON_EXECUTION;
1113
1114     // If the current replay entry is a sync record matching the worker's
1115     // pedigree, AND this isn't the last child to the sync, return
1116     // WAIT_FOR_CONTINUE to indicate that the caller should loop until
1117     // we find the right frame to steal and CONTINUE_EXECUTION is returned.
1118     int match_found = replay_match_sync_pedigree(w);
1119     if (match_found && (0 != simulate_decjoin(ff)))
1120         return WAIT_FOR_CONTINUE;
1121
1122     START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) {
1123         if (decjoin(ff) == 0) {
1124             provably_good_steal_reducers(w, ff);
1125             provably_good_steal_exceptions(w, ff);
1126             provably_good_steal_stacks(w, ff);
1127             __cilkrts_mark_synched(ff);
1128
1129             // If the original owner wants this frame back (to resume
1130             // it on its original thread) pass it back now.
1131             if (NULL != ff->sync_master) {
1132                 // The frame wants to go back and be executed by the original
1133                 // user thread.  We can throw caution to the wind and push the
1134                 // frame straight onto its queue because the only way we have
1135                 // gotten to this point of being able to continue execution of
1136                 // the frame is if the original user worker is spinning without
1137                 // work.
1138
1139                 unset_sync_master(w->l->team, ff);
1140                 __cilkrts_push_next_frame(w->l->team, ff);
1141
1142                 // If this is the team leader we're not abandoning the work
1143                 if (w == w->l->team)
1144                     result = CONTINUE_EXECUTION;
1145             } else {
1146                 __cilkrts_push_next_frame(w, ff);
1147                 result = CONTINUE_EXECUTION;  // Continue working on this thread
1148             }
1149
1150             // The __cilkrts_push_next_frame() call changes ownership
1151             // of ff to the specified worker.
1152         }
1153     } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL);
1154
1155     // Only write a SYNC record if:
1156     // - We're recording a log *AND*
1157     // - We're the worker continuing from this sync
1158     replay_record_sync(w, result == CONTINUE_EXECUTION);
1159
1160     // If we're replaying a log, and matched a sync from the log, mark the
1161     // sync record seen if the sync isn't going to be abandoned.
1162     replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION);
1163
1164     return result;
1165 }
1166
1167 static void unconditional_steal(__cilkrts_worker *w,
1168                                 full_frame *ff)
1169 {
1170     // ASSERT: we hold ff->lock
1171
1172     START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) {
1173         decjoin(ff);
1174         __cilkrts_push_next_frame(w, ff);
1175     } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL);
1176 }
1177
1178
1179 /* CHILD is about to die.  Give its exceptions to a sibling or to the
1180    parent.  */
1181 static inline void splice_exceptions_for_call(__cilkrts_worker *w,
1182                                               full_frame *parent_ff,
1183                                               full_frame *child_ff)
1184 {
1185     // ASSERT: We own parent_ff->lock
1186     CILK_ASSERT(child_ff->is_call_child);
1187     CILK_ASSERT(NULL == child_ff->right_pending_exception);
1188     CILK_ASSERT(NULL == parent_ff->pending_exception);
1189
1190     parent_ff->pending_exception = child_ff->pending_exception;
1191     child_ff->pending_exception = NULL;
1192 }
1193
1194 /**
1195  * Merge exceptions for a dying child.
1196  *
1197  * @param w                   The currently executing worker.
1198  * @param ff                  The child frame that is dying.
1199  * @param left_exception_ptr  Pointer to the exception that is to our left.
1200  */
1201 static inline
1202 void splice_exceptions_for_spawn(__cilkrts_worker *w,
1203                                  full_frame *ff,
1204                                  struct pending_exception_info **left_exception_ptr)
1205 {
1206     // ASSERT: parent_ff == child_ff->parent.
1207     // ASSERT: We own parent_ff->lock
1208
1209     // Merge current exception into the slot where the left
1210     // exception should go.
1211     *left_exception_ptr =
1212         __cilkrts_merge_pending_exceptions(w,
1213                                            *left_exception_ptr,
1214                                            ff->pending_exception);
1215     ff->pending_exception = NULL;
1216
1217
1218     // Merge right exception into the slot where the left exception
1219     // should go.
1220     *left_exception_ptr =
1221         __cilkrts_merge_pending_exceptions(w,
1222                                            *left_exception_ptr,
1223                                            ff->right_pending_exception);
1224     ff->right_pending_exception = NULL;
1225 }
1226
1227
1228 static inline void splice_stacks_for_call(__cilkrts_worker *w,
1229                                           full_frame *parent_ff,
1230                                           full_frame *child_ff)
1231 {
1232 #if CILK_LIB_DEBUG
1233     if (parent_ff->call_stack)
1234         CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ));
1235 #endif
1236
1237     /* A synched frame does not have accumulated child reducers. */
1238     CILK_ASSERT(!child_ff->fiber_child);
1239     CILK_ASSERT(child_ff->is_call_child);
1240
1241     /* An attached parent has no self fiber.  It may have
1242        accumulated child fibers or child owners, which should be
1243        ignored until sync. */
1244     CILK_ASSERT(!parent_ff->fiber_self);
1245     parent_ff->fiber_self = child_ff->fiber_self;
1246     child_ff->fiber_self = NULL;
1247 }
1248
1249 static void finalize_child_for_call(__cilkrts_worker *w,
1250                                     full_frame *parent_ff,
1251                                     full_frame *child_ff)
1252 {
1253     // ASSERT: we hold w->lock and parent_ff->lock
1254
1255     START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) {
1256         CILK_ASSERT(child_ff->is_call_child);
1257         CILK_ASSERT(child_ff->join_counter == 0);
1258         CILK_ASSERT(!child_ff->rightmost_child);
1259         CILK_ASSERT(child_ff == parent_ff->rightmost_child);
1260
1261         // CHILD is about to die.
1262         // Splicing out reducers is a no-op for a call since
1263         // w->reducer_map should already store the correct
1264         // reducer map.
1265
1266         // ASSERT there are no maps left to reduce.
1267         CILK_ASSERT(NULL == child_ff->children_reducer_map);
1268         CILK_ASSERT(NULL == child_ff->right_reducer_map);
1269
1270         splice_exceptions_for_call(w, parent_ff, child_ff);
1271
1272         splice_stacks_for_call(w, parent_ff, child_ff);
1273
1274         /* remove CHILD from list of children of PARENT */
1275         unlink_child(parent_ff, child_ff);
1276
1277         /* continue with the parent. */
1278         unconditional_steal(w, parent_ff);
1279         __cilkrts_destroy_full_frame(w, child_ff);
1280     } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD);
1281 }
1282
1283
1284 /**
1285  * The invariant on ff->children_reducer_map is that when ff is
1286  * synched and when we are about to resume execution of ff, at least
1287  * one of ff->children_reducer_map and w->reducer_map must be NULL.
1288  *
1289  * Consider the two possibilities before resuming execution of ff:
1290  *
1291  * 1.  Suppose ff is synched and suspended.  Then either
1292  *
1293  *     (a) ff->children_reducer_map stores the reducer map that w
1294  *         should use, where w is the worker resuming execution of ff,
1295  *         OR
1296  *     (b) w already has a user map, and ff->children_reducer_map is NULL.
1297  *
1298  *     Case (a) happens when we are resuming execution of ff as a
1299  *     provably good steal.  In this case, w->reducer_map should be
1300  *     NULL and ff->children_reducer_map is valid.  To resume
1301  *     execution of ff on w, set w->reducer_map to
1302  *     ff->children_reducer_map.
1303  *
1304  *     Case (b) occurs when we resume execution of ff because ff is a
1305  *     called child.  Then, ff->children_reducer_map should be NULL,
1306  *     and w should already have a valid reducer map when resuming
1307  *     execution of ff.  We resume execution of ff without changing
1308  *     w->reducer_map.
1309  *
1310  * 2. Suppose frame ff is not synched (i.e., it is active and might have
1311  *    active children).   Then ff->children_reducer_map is the slot for
1312  *    storing the reducer map from ff's leftmost child, as in the reducer
1313  *    protocol.   The runtime may resume execution of ff while it is not
1314  *    synched only because of a steal.
1315  *    In this case, while we are resuming ff, ff->children_reducer_map
1316  *    may be non-NULL (because one of ff's children has completed).
1317  *    We resume execution of ff without changing w->reducer_map.
1318  */
1319 static void setup_for_execution_reducers(__cilkrts_worker *w,
1320                                          full_frame *ff)
1321 {
1322     // We only need to move ff->children_reducer_map into
1323     // w->reducer_map in case 1(a).
1324     //
1325     // First check whether ff is synched.
1326     __cilkrts_stack_frame *sf = ff->call_stack;
1327     if (!(sf->flags & CILK_FRAME_UNSYNCHED)) {
1328         // In this case, ff is synched. (Case 1).
1329         CILK_ASSERT(!ff->rightmost_child);
1330
1331         // Test whether we are in case 1(a) and have
1332         // something to do.  Note that if both
1333         // ff->children_reducer_map and w->reducer_map are NULL, we
1334         // can't distinguish between cases 1(a) and 1(b) here.
1335         if (ff->children_reducer_map) {
1336             // We are in Case 1(a).
1337             CILK_ASSERT(!w->reducer_map);
1338             w->reducer_map = ff->children_reducer_map;
1339             ff->children_reducer_map = NULL;
1340         }
1341     }
1342 }
1343
1344 static void setup_for_execution_exceptions(__cilkrts_worker *w,
1345                                            full_frame *ff)
1346 {
1347     CILK_ASSERT(NULL == w->l->pending_exception);
1348     w->l->pending_exception = ff->pending_exception;
1349     ff->pending_exception = NULL;
1350 }
1351
#if 0 /* unused */
/* Empty placeholder for a stack-setup step.  It is compiled out, and
   the only call to it, in setup_for_execution(), is commented out. */
static void setup_for_execution_stack(__cilkrts_worker *w,
                                      full_frame *ff)
{
}
#endif
1358
1359 /*
1360  * setup_for_execution_pedigree
1361  *
1362  * Copies the pedigree information from the frame we're resuming to the
1363  * worker.  Increments the pedigree if this is work that has been stolen
1364  * to match the increment on a return from a spawn helper.
1365  */
1366 static void setup_for_execution_pedigree(__cilkrts_worker *w)
1367 {
1368     int pedigree_unsynched;
1369     __cilkrts_stack_frame *sf = w->current_stack_frame;
1370
1371     CILK_ASSERT(NULL != sf);
1372
1373     // If this isn't an ABI 1 or later frame, there's no pedigree information
1374     if (0 == CILK_FRAME_VERSION_VALUE(sf->flags))
1375         return;
1376
1377     // Note whether the pedigree is unsynched and clear the flag before
1378     // we forget
1379     pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1380     sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
1381
1382     // If we're just marshalling onto this worker, do not increment
1383     // the rank since that wouldn't happen in a sequential execution
1384     if (w->l->work_stolen || pedigree_unsynched)
1385     {
1386         if (w->l->work_stolen)
1387             w->pedigree.rank = sf->parent_pedigree.rank + 1;
1388         else
1389             w->pedigree.rank = sf->parent_pedigree.rank;
1390     }
1391
1392     w->pedigree.parent = sf->parent_pedigree.parent;
1393     w->l->work_stolen = 0;
1394 }
1395
1396 static void setup_for_execution(__cilkrts_worker *w,
1397                                 full_frame *ff,
1398                                 int is_return_from_call)
1399 {
1400     // ASSERT: We own w->lock and ff->lock || P == 1
1401
1402     setup_for_execution_reducers(w, ff);
1403     setup_for_execution_exceptions(w, ff);
1404     /*setup_for_execution_stack(w, ff);*/
1405
1406     ff->call_stack->worker = w;
1407     w->current_stack_frame = ff->call_stack;
1408
1409     // If this is a return from a call, leave the pedigree alone
1410     if (! is_return_from_call)
1411         setup_for_execution_pedigree(w);
1412
1413     __cilkrts_setup_for_execution_sysdep(w, ff);
1414
1415     w->head = w->tail = w->l->ltq;
1416     reset_THE_exception(w);
1417
1418     make_runnable(w, ff);
1419 }
1420
1421
1422 /*
1423  * Called by the scheduling fiber, right before
1424  * resuming a sf/ff for user code.
1425  *
1426  * This method associates the specified sf with the worker.
1427  *
1428  * It also asserts that w, ff, and sf all have the expected properties
1429  * for resuming user code.
1430  */
1431 void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w,
1432                                                   full_frame *ff,
1433                                                   __cilkrts_stack_frame *sf)
1434 {
1435     w->current_stack_frame = sf;
1436     sf->worker = w;
1437
1438     // Lots of debugging checks on the state of the fiber we might be
1439     // resuming.
1440 #if FIBER_DEBUG >= 1
1441 #   if FIBER_DEBUG >= 3
1442     {
1443         fprintf(stderr, "w=%d: ff=%p, sf=%p. about to resume user code\n",
1444                 w->self, ff, sf);
1445     }
1446 #   endif
1447
1448     const int flags = sf->flags;
1449     CILK_ASSERT(flags & CILK_FRAME_SUSPENDED);
1450     CILK_ASSERT(!sf->call_parent);
1451     CILK_ASSERT(w->head == w->tail);
1452
1453     /* A frame can not be resumed unless it was suspended. */
1454     CILK_ASSERT(ff->sync_sp != NULL);
1455
1456     /* The leftmost frame has no allocated stack */
1457     if (ff->simulated_stolen)
1458         CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED);
1459     else if (flags & CILK_FRAME_UNSYNCHED)
1460         /* XXX By coincidence sync_sp could be null. */
1461         CILK_ASSERT(ff->fiber_self != NULL);
1462     else
1463         /* XXX This frame could be resumed unsynched on the leftmost stack */
1464         CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w));
1465     CILK_ASSERT(w->l->frame_ff == ff);
1466 #endif
1467 }
1468
1469
1470 /**
1471  * This method is the first method that should execute after we've
1472  * switched to a scheduling fiber from user code.
1473  *
1474  * @param fiber The scheduling fiber for the current worker.
1475  * @param wptr  The current worker.
1476  */
1477 static void enter_runtime_transition_proc(cilk_fiber *fiber)
1478 {
1479     // We can execute this method for one of three reasons:
1480     // 1. Undo-detach finds parent stolen.
1481     // 2. Sync suspends frame.
1482     // 3. Return from Cilk entry point.
1483     //
1484     //
1485     // In cases 1 and 2, the frame may be truly suspended or
1486     // may be immediately executed by this worker after provably_good_steal.
1487     //
1488     //
1489     // There is a fourth case, which can, but does not need to execute
1490     // this function:
1491     //   4. Starting up the scheduling loop on a user or
1492     //      system worker.  In this case, we won't have
1493     //      a scheduling stack function to run.
1494     __cilkrts_worker* w = cilk_fiber_get_owner(fiber);
1495     if (w->l->post_suspend) {
1496         // Run the continuation function passed to longjmp_into_runtime
1497         run_scheduling_stack_fcn(w);
1498
1499         // After we have jumped into the runtime and run the
1500         // scheduling function, any reducer map the worker had before entering the runtime
1501         // should have already been saved into the appropriate full
1502         // frame.
1503         CILK_ASSERT(NULL == w->reducer_map);
1504
1505         // There shouldn't be any uncaught exceptions.
1506         //
1507         // In Windows, the OS catches any exceptions not caught by the
1508         // user code.  Thus, we are omitting the check on Windows.
1509         //
1510         // On Android, calling std::uncaught_exception with the stlport
1511         // library causes a seg fault.  Since we're not supporting
1512         // exceptions there at this point, just don't do the check
1513         //
1514         // TBD: Is this check also safe to do on Windows?
1515         CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1516     }
1517 }
1518
1519
1520 /**
1521  * Method called to jump back to executing user code.
1522  *
1523  * A normal return from the runtime back to resuming user code calls
1524  * this method.  A computation executed using force_reduce also calls
1525  * this method to return to user code.
1526  *
1527  * This function should not contain any code that depends on a fiber.
1528  * In a force-reduce case, the user worker may not have a fiber.  In
1529  * the force-reduce case, we call this method directly instead of
1530  * calling @c user_code_resume_after_switch_into_runtime.
1531  */
1532 static inline NORETURN
1533 cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff)
1534 {
1535     // Save the sync stack pointer, and do the bookkeeping
1536     char* sync_sp = ff->sync_sp;
1537     __cilkrts_take_stack(ff, sync_sp);  // leaves ff->sync_sp null
1538
1539     sf->flags &= ~CILK_FRAME_SUSPENDED;
1540     // Actually longjmp to the user code.
1541     // We may have exceptions to deal with, since we are resuming
1542     // a previous-suspended frame.
1543     sysdep_longjmp_to_sf(sync_sp, sf, ff);
1544 }
1545
1546
1547 /**
1548  * Called by the user-code fiber right before resuming a full frame
1549  * (sf/ff).
1550  *
1551  * This method pulls sf/ff out of the worker, and then calls
1552  * cilkrts_resume to jump to user code.
1553  */
1554 static NORETURN
1555 user_code_resume_after_switch_into_runtime(cilk_fiber *fiber)
1556 {
1557     __cilkrts_worker *w = cilk_fiber_get_owner(fiber);
1558     __cilkrts_stack_frame *sf;
1559     full_frame *ff;
1560     sf = w->current_stack_frame;
1561     ff = sf->worker->l->frame_ff;
1562
1563 #if FIBER_DEBUG >= 1
1564     CILK_ASSERT(ff->fiber_self == fiber);
1565     cilk_fiber_data *fdata = cilk_fiber_get_data(fiber);
1566     DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n",
1567                w->self, w, fiber);
1568     CILK_ASSERT(sf == fdata->resume_sf);
1569 #endif
1570
1571     // Notify the Intel tools that we're stealing code
1572     ITT_SYNC_ACQUIRED(sf->worker);
1573     NOTIFY_ZC_INTRINSIC("cilk_continue", sf);
1574     cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT);
1575
1576     // Actually jump to user code.
1577     cilkrts_resume(sf, ff);
1578  }
1579
1580
1581 /* The current stack is about to either be suspended or destroyed.  This
1582  * function will switch to the stack on which the scheduler is suspended and
1583  * resume running the scheduler within function do_work().  Upon waking up,
1584  * the scheduler will run the 'fcn' function (saved in w->l->post_suspend),
1585  * using the supplied worker and frame.
1586  */
1587 static NORETURN
1588 longjmp_into_runtime(__cilkrts_worker *w,
1589                      scheduling_stack_fcn_t fcn,
1590                      __cilkrts_stack_frame *sf)
1591 {
1592     full_frame *ff, *ff2;
1593
1594     CILK_ASSERT(!w->l->post_suspend);
1595     ff = w->l->frame_ff;
1596
1597     // If we've got only one worker, stealing shouldn't be possible.
1598     // Assume that this is a steal or return from spawn in a force-reduce case.
1599     // We don't have a scheduling stack to switch to, so call the continuation
1600     // function directly.
1601     if (1 == w->g->P) {
1602         fcn(w, ff, sf);
1603
1604         /* The call to fcn() will have pushed ff as the next frame.  If
1605          * this were a normal (non-forced-reduce) execution, there would have
1606          * been a pop_next_frame call in a separate part of the runtime.  We
1607          * must call pop_next_frame here to complete the push/pop cycle. */
1608         ff2 = pop_next_frame(w);
1609
1610         setup_for_execution(w, ff2, 0);
1611         scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame);
1612         cilkrts_resume(w->current_stack_frame, ff2);
1613
1614 // Suppress clang warning that the expression result is unused
1615 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1616 #   pragma clang diagnostic push
1617 #   pragma clang diagnostic ignored "-Wunused-value"
1618 #endif // __clang__
1619         /* cilkrts_resume() must not return; fail loudly if it does */
1620         CILK_ASSERT(((void)"returned from __cilkrts_resume", 0));
1621 #if defined(__clang__) && (! defined(__INTEL_COMPILER))
1622 #   pragma clang diagnostic pop
1623 #endif // __clang__
1624     }
1625
1626     w->l->post_suspend = fcn;    // function to run once in the runtime
1627     w->l->suspended_stack = sf;  // frame that will be handed to it
1628
1629     ITT_SYNC_RELEASING(w);
1630     ITT_SYNC_PREPARE(w);
1631
1632 #if FIBER_DEBUG >= 2
1633     fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n",
1634             cilkos_get_current_thread_id(),
1635             w->self, w->l->frame_ff,
1636             sf);
1637 #endif
1638
1639     // Current fiber is either the (1) one we are about to free,
1640     // or (2) it has been passed up to the parent.
1641     cilk_fiber *current_fiber = ( w->l->fiber_to_free ?
1642                                   w->l->fiber_to_free :
1643                                   w->l->frame_ff->parent->fiber_child );
1644     cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber);
1645     CILK_ASSERT(NULL == w->l->frame_ff->fiber_self);
1646
1647     // Clear the sf in the current fiber for cleanliness, to prevent
1648     // us from accidentally resuming a bad sf.
1649     // Technically, resume_sf gets overwritten for a fiber when
1650     // we are about to resume it anyway.
1651     fdata->resume_sf = NULL;
1652     CILK_ASSERT(fdata->owner == w);
1653
1654     // Set the function to execute immediately after switching to the
1655     // scheduling fiber, but before freeing any fibers.
1656     cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber,
1657                                     enter_runtime_transition_proc);
1658     cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN);
1659
1660     if (w->l->fiber_to_free) {
1661         // Case 1: we are freeing this fiber.  We never
1662         // resume this fiber again after jumping into the runtime.
1663         w->l->fiber_to_free = NULL;
1664
1665         // Extra check. Normally, the fiber we are about to switch to
1666         // should have a NULL owner.
1667         CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner);
1668 #if FIBER_DEBUG >= 4
1669         fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1670                 cilkos_get_current_thread_id(),
1671                 w->self,
1672                 current_fiber, w->l->scheduling_fiber);
1673 #endif
1674         cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE);
1675         NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER);
1676         cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber,
1677                                                                &w->l->fiber_pool,
1678                                                                w->l->scheduling_fiber);
1679         // We should never come back here!
1680         CILK_ASSERT(0);
1681     }
1682     else {
1683         // Case 2: We are passing the fiber to our parent because we
1684         // are leftmost.  We should come back later to
1685         // resume execution of user code.
1686         //
1687         // If we are not freeing a fiber, then we must be
1688         // returning from a spawn or processing an exception.  The
1689         // "sync" path always frees a fiber.
1690         //
1691         // We must be the leftmost child, and by left holder logic, we
1692         // have already moved the current fiber into our parent full
1693         // frame.
1694 #if FIBER_DEBUG >= 2
1695         fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n",
1696                 cilkos_get_current_thread_id(),
1697                 w->self,
1698                 current_fiber, w->l->scheduling_fiber);
1699 #endif
1700
1701         NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
1702
1703         cilk_fiber_suspend_self_and_resume_other(current_fiber,
1704                                                  w->l->scheduling_fiber);
1705         // Resuming this fiber returns control back to
1706         // this function because our implementation uses OS fibers.
1707         //
1708         // On Unix, we could have the choice of passing the
1709         // user_code_resume_after_switch_into_runtime as an extra "resume_proc"
1710         // that resumes execution of user code instead of
1711         // jumping back here, and then jumping back to user code.
1712 #if FIBER_DEBUG >= 2
1713         CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker());
1714 #endif
1715         user_code_resume_after_switch_into_runtime(current_fiber);
1716     }
1717 }
1718
1719 /*
1720  * Deliver msg to the signal node of each child of worker w.  Callers in
1721  * this file send 0 to make the children wait and 1 to make them run.
1722  */
1723 static void notify_children(__cilkrts_worker *w, unsigned int msg)
1724 {
1725     // The children of worker n are workers 2n + 1 and 2n + 2; only
1726     // indices below P - 1 are notified.
1727     const int num_sys_workers = w->g->P - 1;
1728     const int first_child = (w->self << 1) + 1;
1729     int k;
1730
1731     for (k = 0; k < 2; k++) {
1732         const int child_num = first_child + k;
1733         __cilkrts_worker *child;
1734         // If this index is out of range, so is every later one.
1735         if (child_num >= num_sys_workers)
1736             break;
1737         child = w->g->workers[child_num];
1738         CILK_ASSERT(child->l->signal_node);
1739         signal_node_msg(child->l->signal_node, msg);
1740     }
1741 }
1742
1743 /* Ask the children of worker w to wait (message value 0). */
1744 static void
1745 notify_children_wait(__cilkrts_worker *w)
1746 {
1747     const unsigned int wait_msg = 0;
1748     notify_children(w, wait_msg);
1749 }
1750
1751 /* Ask the children of worker w to run and start stealing (message 1). */
1752 static void
1753 notify_children_run(__cilkrts_worker *w)
1754 {
1755     const unsigned int run_msg = 1;
1756     notify_children(w, run_msg);
1757 }
1758
1759 /**
1760  * Make one attempt to obtain work: check w's own queue once and, if
1761  * that is empty, perform a single random steal attempt.
1762  * Returns the full frame found, or NULL if this attempt came up empty.
1763  */
1764 static full_frame* check_for_work(__cilkrts_worker *w)
1765 {
1766     unsigned int fail_period;
1767     full_frame *found = pop_next_frame(w);
1768
1769     // Fast path: work was already queued locally.
1770     if (NULL != found)
1771         return found;
1772
1773     START_INTERVAL(w, INTERVAL_STEALING) {
1774         // A non-user worker that has run dry gives up its team
1775         // affiliation before stealing; user workers keep theirs.
1776         if (w->l->type != WORKER_USER && w->l->team != NULL) {
1777             __cilkrts_worker_lock(w);
1778             w->l->team = NULL;
1779             __cilkrts_worker_unlock(w);
1780         }
1781
1782         // No full frame may be attached to w while it steals.
1783         CILK_ASSERT(NULL == w->l->frame_ff);
1784         random_steal(w);
1785     } STOP_INTERVAL(w, INTERVAL_STEALING);
1786
1787     // A successful steal leaves the work on w's own queue.
1788     found = pop_next_frame(w);
1789     if (NULL != found) {
1790         // There is still work around: clear the failure history.
1791         w->l->steal_failure_count = 0;
1792         w->l->has_stolen = 1;
1793         return found;
1794     }
1795
1796     // The steal failed.  Back off before the caller retries.
1797     fail_period = w->g->max_steal_failures << 1;
1798     if (0 == w->l->has_stolen &&
1799         fail_period - 1 == w->l->steal_failure_count % fail_period) {
1800         // Never stole anything during the grace period: idle briefly.
1801         __cilkrts_idle();
1802     } else {
1803         __cilkrts_yield();
1804     }
1805
1806     w->l->steal_failure_count++;
1807     if (w->l->steal_failure_count > (fail_period << 8)) {
1808         // After enough failures, forget that we ever stole.
1809         // - This reduces cpu time in top-level synched regions.
1810         // - max_steal_failures can be controlled by the user
1811         //   (CILK_STEAL_FAILURES).
1812         w->l->has_stolen = 0;
1813     }
1814
1815     return found;
1816 }
1817
1818 /**
1819  * Loop until this worker either obtains a full frame to run or is
1820  * told to leave the scheduler.
1821  *
1822  * Returns the full frame found, or NULL when worker_runnable()
1823  * reports SCHEDULE_EXIT.
1824  */
1825 static full_frame* search_until_work_found_or_done(__cilkrts_worker *w)
1826 {
1827     // Each pass asks worker_runnable() what to do next, then either
1828     // looks for work once, sleeps on the signal node, or exits.
1829     for (;;) {
1830         full_frame *found;
1831
1832         switch (worker_runnable(w))
1833         {
1834         case SCHEDULE_RUN:             // try once: own queue, then a steal
1835             found = check_for_work(w);
1836             if (found)
1837                 return found;
1838             break;
1839         case SCHEDULE_WAIT:            // sleep until signalled
1840             START_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1841             // Only system workers wait, and never with work queued.
1842             CILK_ASSERT(WORKER_SYSTEM == w->l->type);
1843             CILK_ASSERT(NULL == w->l->next_frame_ff);
1844             notify_children_wait(w);
1845             signal_node_wait(w->l->signal_node);
1846             // Woken up: restart the children and the failure count.
1847             notify_children_run(w);
1848             w->l->steal_failure_count = 0;
1849             STOP_INTERVAL(w, INTERVAL_SCHEDULE_WAIT);
1850             break;
1851         case SCHEDULE_EXIT:            // leave the scheduler
1852             CILK_ASSERT(WORKER_USER != w->l->type);
1853             return NULL;
1854         default:
1855             CILK_ASSERT(0);
1856             abort();
1857         }
1858     }
1859 }
1860
1861 /**
1862  * Fiber procedure run by the scheduling fiber of a user worker.
1863  *
1864  * A user worker entering the runtime lands here, either because its
1865  * scheduling fiber is being started for the first time or because a
1866  * previously suspended scheduling fiber is being resumed.
1867  */
1868 COMMON_PORTABLE
1869 void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber)
1870 {
1871     __cilkrts_worker *w;
1872
1873     // The fiber must have an owner, and it must be a user worker.
1874     w = cilk_fiber_get_owner(fiber);
1875     CILK_ASSERT(w);
1876     CILK_ASSERT(WORKER_USER == w->l->type);
1877
1878     // Sanity check that w really is the worker running this code.
1879     verify_current_wkr(w);
1880
1881     __cilkrts_run_scheduler_with_exceptions(w);
1882 }
1883
1884
1885 /**
1886  * The body of the runtime scheduling loop.  This function executes in
1887  * 4 stages:
1888  *
1889  * 1. Transitions from the user code into the runtime by
1890  *    executing any scheduling-stack functions.
1891  * 2. Looks for a full frame enqueued from a successful provably
1892  *    good steal.
1893  * 3. If no full frame is found in step 2, steal until
1894  *    a frame is found or we are done.  If we are done, finish
1895  *    the scheduling loop.
1896  * 4. When a frame is found, setup to resume user code.
1897  *    In particular, suspend the current fiber and resume the
1898  *    user fiber to execute the frame.
1899  *
1900  * Returns a fiber object that we should switch to after completing
1901  * the body of the loop, or NULL if no frame was found (which is
1902  * asserted to happen only once w->g->work_done is set).
1903  *
1904  * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber
1905  *
1906  * @param current_fiber   The currently executing (scheduling) fiber
1907  * @param wptr            The currently executing worker.
1908  * @return                The next fiber we should switch to.
1909  */
1910 static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber,
1911                                                void* wptr)
1912 {
1913     __cilkrts_worker *w = (__cilkrts_worker*) wptr;
1914     CILK_ASSERT(current_fiber == w->l->scheduling_fiber);
1915
1916     // Stage 1: Transition from executing user code to the runtime code.
1917     // We don't need to do this call here any more, because
1918     // every switch to the scheduling fiber should make this call
1919     // using a post_switch_proc on the fiber.
1920     //
1921     //  enter_runtime_transition_proc(w->l->scheduling_fiber, wptr);
1922
1923     // After Stage 1 is complete, w should no longer have
1924     // an associated full frame.
1925     CILK_ASSERT(NULL == w->l->frame_ff);
1926
1927     // Stage 2.  First do a quick check of our 1-element queue.
1928     full_frame *ff = pop_next_frame(w);
1929
1930     if (!ff) {
1931         // Stage 3.  We didn't find anything from our 1-element
1932         // queue.  Now go through the steal loop to find work.
1933         ff = search_until_work_found_or_done(w);
1934         if (!ff) {
1935             CILK_ASSERT(w->g->work_done);
1936             return NULL;
1937         }
1938     }
1939
1940     // Stage 4.  Now that we have found a full frame to work on,
1941     // actually execute it.
1942     __cilkrts_stack_frame *sf;
1943
1944     // There shouldn't be any uncaught exceptions.
1945     //
1946     // In Windows, the OS catches any exceptions not caught by the
1947     // user code.  Thus, we are omitting the check on Windows.
1948     //
1949     // On Android, calling std::uncaught_exception with the stlport
1950     // library causes a seg fault.  Since we're not supporting
1951     // exceptions there at this point, just don't do the check
1952     CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION();
1953
1954     BEGIN_WITH_WORKER_LOCK(w) {
1955         CILK_ASSERT(!w->l->frame_ff);
1956         BEGIN_WITH_FRAME_LOCK(w, ff) {
1957             sf = ff->call_stack;
1958             CILK_ASSERT(sf && !sf->call_parent);
1959             setup_for_execution(w, ff, 0);
1960         } END_WITH_FRAME_LOCK(w, ff);
1961     } END_WITH_WORKER_LOCK(w);
1962
1963     /* run it */
1964     //
1965     // Prepare to run the full frame.  To do so, we need to:
1966     //   (a) Execute some code on this fiber (the scheduling
1967     //       fiber) to set up data structures, and
1968     //   (b) Suspend the scheduling fiber, and resume the
1969     //       user-code fiber.
1970
1971     // Part (a). Set up data structures.
1972     scheduling_fiber_prepare_to_resume_user_code(w, ff, sf);
1973
1974     cilk_fiber *other = w->l->frame_ff->fiber_self;
1975     cilk_fiber_data* other_data = cilk_fiber_get_data(other);
1976     cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber);
1977
1978     // I believe two cases are possible here, both of which
1979     // should have other_data->resume_sf as NULL.
1980     //
1981     // 1. Resuming a fiber that was previously executing
1982     //    user code (i.e., a provably-good-steal).
1983     //    In this case, resume_sf should have been
1984     //    set to NULL when it was suspended.
1985     //
1986     // 2. Resuming code on a steal.  In this case, since we
1987     //    grabbed a new fiber, resume_sf should be NULL.
1988     CILK_ASSERT(NULL == other_data->resume_sf);
1989
1990 #if FIBER_DEBUG >= 2
1991     fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n",
1992             w->self, other, sf);  // print the value assigned below
1993 #endif
1994     // Update our own fiber's data.
1995     current_fiber_data->resume_sf = NULL;
1996     // The scheduling fiber should have the right owner from before.
1997     CILK_ASSERT(current_fiber_data->owner == w);
1998     other_data->resume_sf = sf;
1999
2000
2001 #if FIBER_DEBUG >= 3
2002     fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n",
2003             cilkos_get_current_thread_id(),
2004             w->self,
2005             current_fiber, other,
2006             current_fiber_data->resume_sf,
2007             other_data->resume_sf);
2008 #endif
2009     return other;
2010 }
2011
2012
2013 /**
2014  * One-time setup performed by every worker before it enters its
2015  * scheduling loop.  User workers run only the common steps; system
2016  * workers additionally wait on their signal node before proceeding.
2017  */
2018 static void worker_scheduler_init_function(__cilkrts_worker *w)
2019 {
2020     // Startup steps shared by all worker types come first.
2021     ITT_SYNC_PREPARE(w);
2022
2023     /* Announce the new worker to the tools.  Inspector needs this, but
2024        we don't want to confuse Cilkscreen with system threads.  User
2025        threads do this notification in bind_thread. */
2026     if (! w->g->under_ptool)
2027         __cilkrts_cilkscreen_establish_worker(w);
2028
2029     // Seed this worker's random number generator.  If this step
2030     // were skipped the worker would always steal from 0: programs
2031     // would still execute correctly, but with a subtle performance
2032     // bug.
2033     mysrand(w, (w->self + 1));
2034
2035     // The remaining startup work depends on the worker type.
2036     if (WORKER_SYSTEM == w->l->type) {
2037         // A system worker starting up means the runtime is starting.
2038         // It begins in a wait-state and is woken up by the first
2039         // user worker when the runtime is ready.
2040         signal_node_wait(w->l->signal_node);
2041
2042         // ...
2043         // Runtime is waking up: tell the children to run and reset
2044         // the steal-failure count.
2045         notify_children_run(w);
2046         w->l->steal_failure_count = 0;
2047     }
2048     else if (WORKER_USER != w->l->type) {
2049         // Neither a user nor a system worker: report a runtime bug.
2050         __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n",
2051                       w, w->l->type);
2052     }
2053
2054     // A user worker needs no additional startup work.
2055 }
2056
2057 /**
2058  * Executed once by each worker after it leaves its scheduling loop.
2059  * Its only action is to assert that the worker is not a user worker.
2060  *
2061  * @note Currently, only system workers finish their loops.  User
2062  * workers will jump away to user code without exiting their
2063  * scheduling loop.
2064  */
2065 static void worker_scheduler_terminate_function(__cilkrts_worker *w)
2066 {
2067     // A user worker should never finish by falling through the
2068     // scheduling loop.
2069     CILK_ASSERT(WORKER_USER != w->l->type);
2070 }
2071
2072 /**
2073  * Top-level scheduler routine that a worker's scheduling fiber
2074  * executes.
2075  *
2076  * It is entered either by a newly started system worker or by a user
2077  * worker that has stalled and just been imported into the runtime.
2078  */
2079 static void worker_scheduler_function(__cilkrts_worker *w)
2080 {
2081     cilk_fiber *next_fiber;
2082
2083     START_INTERVAL(w, INTERVAL_INIT_WORKER);
2084     worker_scheduler_init_function(w);
2085     STOP_INTERVAL(w, INTERVAL_INIT_WORKER);
2086
2087     // Keep running the loop body until the global work_done flag is set.
2088     while (!w->g->work_done) {
2089         // One pass of the scheduling loop tells us which fiber, if
2090         // any, to switch to next.
2091         START_INTERVAL(w, INTERVAL_SCHED_LOOP);
2092         next_fiber = worker_scheduling_loop_body(w->l->scheduling_fiber, w);
2093         STOP_INTERVAL(w, INTERVAL_SCHED_LOOP);
2094
2095         // Nothing to switch to: re-test the loop condition.
2096         if (!next_fiber)
2097             continue;
2098
2099         NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER);
2100
2101         // Jumping to user code ends the time spent in the runtime
2102         // and begins time spent working.
2103         STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2104         START_INTERVAL(w, INTERVAL_WORKING);
2105         cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
2106                                                  next_fiber);
2107
2108         // Control returns here only once this scheduling fiber is
2109         // resumed, i.e. when the worker re-enters the runtime.  The
2110         // fiber-switch code has already moved us from WORKING back
2111         // to IN_RUNTIME by then.
2112     }
2113
2114     // Leaving the loop: run the worker's termination step.
2115     worker_scheduler_terminate_function(w);
2116 }
2117
2118
2119 /*************************************************************
2120   Forward declarations for reduction protocol.
2121 *************************************************************/
2122
2123 static __cilkrts_worker*
2124 execute_reductions_for_sync(__cilkrts_worker *w,
2125                             full_frame *ff,
2126                             __cilkrts_stack_frame *sf_at_sync);
2127
2128 static __cilkrts_worker*
2129 execute_reductions_for_spawn_return(__cilkrts_worker *w,
2130                                     full_frame *ff,
2131                                     __cilkrts_stack_frame *returning_sf);
2132
2133
2134
2135 /*************************************************************
2136   Scheduler functions that are callable by client code
2137 *************************************************************/
2138 static full_frame *disown(__cilkrts_worker *w, full_frame *ff,
2139                           __cilkrts_stack_frame *sf, const char *why)
2140 {
2141     // Mark ff unrunnable, clear w's current full frame, and hand
2142     // back ff's parent.  The flag tells make_unrunnable if sf is set.
2143     int have_sf = (NULL != sf);
2144     CILK_ASSERT(ff);
2145     make_unrunnable(w, ff, sf, have_sf, why);
2146     w->l->frame_ff = NULL;
2147     return ff->parent;
2148 }
2148
2149 /**
2150  * Called when ff is returning from a spawn, and we need to execute a
2151  * reduction.
2152  *
2153  * @param w             The currently executing worker.
2154  * @param ff            The full frame for w.
2155  * @param returning_sf  The stack frame for the spawn helper that is returning.
2156  *
2157  * Normally, by the time we gain control in the runtime, the worker
2158  * has already popped off the __cilkrts_stack_frame "returning_sf"
2159  * from its call chain.
2160  *
2161  * When we have only serial reductions, w->current_stack_frame is not
2162  * needed any more, because w is about to enter the runtime scheduling
2163  * loop anyway.  Similarly, the frame "ff" is slated to be destroyed
2164  * after the runtime finishes the return from spawn and splices ff out
2165  * of the tree of full frames.
2166  *
2167  * To execute a parallel reduction, however, we still want
2168  * w->current_stack_frame == returning_sf, and we are going to use the
2169  * frame ff for a little bit longer.
2170  *
2171  * This method:
2172  *
2173  *   1. Puts returning_sf back as w's current stack frame.
2174  *   2. Makes "ff" runnable again on w (via make_runnable()).
2175  */
2176 static inline
2177 void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w,
2178                                               full_frame *ff,
2179                                               __cilkrts_stack_frame *returning_sf) {
2180 #if REDPAR_DEBUG >= 2
2181     CILK_ASSERT(returning_sf);
2182     CILK_ASSERT(returning_sf->worker == w);
2183 #endif
2184     // Change w's current stack frame back to "returning_sf".
2185     //
2186     // Intuitively, w->current_stack_frame should be
2187     // returning_sf->call_parent at this point.
2188     //
2189     // We cannot assert this, however, because the pop of
2190     // returning_sf from the call chain has already cleared
2191     // returning_sf->call_parent.  We don't want to restore the call
2192     // parent of returning_sf, because its parent has been stolen, and
2193     // the runtime assumes that steals break this link.
2194
2195     // We cannot assert call_parent is NULL either, since that's not true for
2196     // Win64 exception handling; hence the assertion below stays disabled:
2197 //    CILK_ASSERT(returning_sf->call_parent == NULL);
2198     w->current_stack_frame = returning_sf;
2199
2200     // Make the full frame "ff" runnable again, in preparation for
2201     // executing the reduction.
2202     make_runnable(w, ff);
2203 }
2204
2205 /* Handle a sync in the runtime: run reductions, then enter do_sync. */
2206 NORETURN __cilkrts_c_sync(__cilkrts_worker *w,
2207                           __cilkrts_stack_frame *sf_at_sync)
2208 {
2209     full_frame *ff;
2210     STOP_INTERVAL(w, INTERVAL_WORKING);
2211     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2212
2213     // Claim: This read of w->l->frame_ff can occur without
2214     // holding the worker lock because when w has reached a sync
2215     // and entered the runtime (because it stalls), w's deque is empty
2216     // and no one else can steal and change w->l->frame_ff.
2217
2218     ff = w->l->frame_ff;
2219 #ifdef _WIN32
2220     __cilkrts_save_exception_state(w, ff);
2221 #else
2222     // Move any pending exceptions into the full frame
2223     CILK_ASSERT(NULL == ff->pending_exception);
2224     ff->pending_exception = w->l->pending_exception;
2225     w->l->pending_exception = NULL;
2226 #endif
2227
2228     w = execute_reductions_for_sync(w, ff, sf_at_sync);  // w is reassigned
2229
2230 #if FIBER_DEBUG >= 3
2231     fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n",
2232             cilkos_get_current_thread_id(), w->self, ff);
2233 #endif
2234
2235     longjmp_into_runtime(w, do_sync, sf_at_sync);  // do_sync is the continuation
2236 }
2237 /* Continuation passed to longjmp_into_runtime() by __cilkrts_c_sync(). */
2238 static void do_sync(__cilkrts_worker *w, full_frame *ff,
2239                     __cilkrts_stack_frame *sf)
2240 {
2241     // Outcome of provably_good_steal(); defaults to abandoning ff.
2242     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2243
2244     START_INTERVAL(w, INTERVAL_SYNC_CHECK) {
2245         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2246
2247             CILK_ASSERT(ff);
2248             BEGIN_WITH_FRAME_LOCK(w, ff) {
2249                 CILK_ASSERT(sf->call_parent == 0);
2250                 CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED);
2251
2252                 // Before switching into the scheduling fiber, we should have
2253                 // already taken care of deallocating the current
2254                 // fiber.
2255                 CILK_ASSERT(NULL == ff->fiber_self);
2256
2257                 // Update the frame's pedigree information if this is an ABI 1
2258                 // or later frame
2259                 if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1)
2260                 {
2261                     sf->parent_pedigree.rank = w->pedigree.rank;
2262                     sf->parent_pedigree.parent = w->pedigree.parent;
2263
2264                     // Note that the pedigree rank needs to be updated
2265                     // when setup_for_execution_pedigree runs
2266                     sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED;
2267                 }
2268
2269                 /* the decjoin() occurs in provably_good_steal() */
2270                 steal_result = provably_good_steal(w, ff);
2271
2272             } END_WITH_FRAME_LOCK(w, ff);
2273             // Clear w->l->frame_ff unless we must wait and retry below.
2274             if (WAIT_FOR_CONTINUE != steal_result) {
2275                 w->l->frame_ff = NULL;
2276             }
2277         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2278     } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK);
2279
2280     // Now, if we are in a replay situation and provably_good_steal() returned
2281     // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call
2282     // provably_good_steal(), and release locks until we get a value other
2283     // than WAIT_FOR_CONTINUE from the function.
2284 #ifdef CILK_RECORD_REPLAY
2285     // We don't have to explicitly check for REPLAY_LOG below because
2286     // steal_result can only be set to WAIT_FOR_CONTINUE during replay
2287     while(WAIT_FOR_CONTINUE == steal_result)
2288     {
2289         __cilkrts_sleep();
2290         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2291         {
2292             ff = w->l->frame_ff;
2293             BEGIN_WITH_FRAME_LOCK(w, ff)
2294             {
2295                 steal_result = provably_good_steal(w, ff);
2296             } END_WITH_FRAME_LOCK(w, ff);
2297             if (WAIT_FOR_CONTINUE != steal_result)
2298                 w->l->frame_ff = NULL;
2299         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2300     }
2301 #endif  // CILK_RECORD_REPLAY
2302
2303 #ifdef ENABLE_NOTIFY_ZC_INTRINSIC
2304     // If we can't make any further progress on this thread, tell Inspector
2305     // that we're abandoning the work and will go find something else to do.
2306     if (ABANDON_EXECUTION == steal_result)
2307     {
2308         NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0);
2309     }
2310 #endif // defined ENABLE_NOTIFY_ZC_INTRINSIC
2311
2312     return; /* back to scheduler loop */
2313 }
2314
2315 /* worker W completely promotes its own deque, simulating the case
2316    where the whole deque is stolen.  We use this mechanism to force
2317    the allocation of new storage for reducers for race-detection
2318    purposes. */
2319 void __cilkrts_promote_own_deque(__cilkrts_worker *w)
2320 {
2321     // Remember the fiber we start this method on.
2322     CILK_ASSERT(w->l->frame_ff);
2323     cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self;
2324
2325     BEGIN_WITH_WORKER_LOCK(w) {
2326         while (dekker_protocol(w)) {  // w is both victim and thief here
2327             /* PLACEHOLDER_FIBER is used as non-null marker to tell detach()
2328                and make_child() that this frame should be treated as a spawn
2329                parent, even though we have not assigned it a stack. */
2330             detach_for_steal(w, w, PLACEHOLDER_FIBER);
2331         }
2332     } END_WITH_WORKER_LOCK(w);
2333
2334
2335     // TBD: The management of full frames and fibers is a bit
2336     // sketchy here.  We are promoting stack frames into full frames,
2337     // and pretending they are stolen away, but no other worker is
2338     // actually working on them.  Some runtime invariants
2339     // may be broken here.
2340     //
2341     // Technically, if we are simulating a steal from w,
2342     // w should get a new full frame, but
2343     // keep the same fiber.  A real thief would be taking the
2344     // loot frame away, getting a new fiber, and starting to execute
2345     // the loot frame.
2346     //
2347     // What should a fake thief do?  Where does the frame go?
2348
2349     // In any case, we should be finishing the promotion process with
2350     // the same fiber we started with.
2351     CILK_ASSERT(w->l->frame_ff);
2352     CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber);
2353 }
2354
2355
2356
2357 /* the client code calls this function after a spawn when the dekker
2358    protocol fails.  The function may either return or longjmp
2359    into the rts
2360
2361    This function takes in a "returning_sf" argument which corresponds
2362    to the __cilkrts_stack_frame that we are finishing (i.e., the
2363    argument to __cilkrts_leave_frame).
2364    */
2365 void __cilkrts_c_THE_exception_check(__cilkrts_worker *w,
2366                                      __cilkrts_stack_frame *returning_sf)
2367 {
2368     full_frame *ff;
2369     int stolen_p;
2370     __cilkrts_stack_frame *saved_sf = NULL;
2371
2372     // For the exception check, stop working and count as time in
2373     // runtime.
2374     STOP_INTERVAL(w, INTERVAL_WORKING);
2375     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2376
2377     START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2378
2379     BEGIN_WITH_WORKER_LOCK(w) {
2380         ff = w->l->frame_ff;
2381         CILK_ASSERT(ff);
2382         /* This code is called only upon a normal return and never
2383            upon an exceptional return.  Assert that this is the
2384            case. */
2385         CILK_ASSERT(!w->l->pending_exception);
2386
2387         reset_THE_exception(w);
2388         stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was
2389                                                   speculatively
2390                                                   decremented by the
2391                                                   compiled code */
2392
2393         if (stolen_p) {
2394             /* XXX This will be charged to THE for accounting purposes */
2395             __cilkrts_save_exception_state(w, ff);
2396
2397             // Save the value of the current stack frame.
2398             saved_sf = w->current_stack_frame;
2399
2400             // Reverse the decrement from undo_detach.
2401             // This update effectively resets the deque to be
2402             // empty (i.e., changes w->tail back to equal w->head).
2403             // We need to reset the deque to execute parallel
2404             // reductions.  When we have only serial reductions, it
2405             // does not matter, since serial reductions do not
2406             // change the deque.
2407             w->tail++;
2408 #if REDPAR_DEBUG > 1
2409             // ASSERT our deque is empty.
2410             CILK_ASSERT(w->head == w->tail);
2411 #endif
2412         }
2413     } END_WITH_WORKER_LOCK(w);
2414
2415     STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK);
2416
2417     if (stolen_p)
2418     {
2419         w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2420
2421         // "Mr. Policeman?  My parent always told me that if I was in trouble
2422         // I should ask a nice policeman for help.  I can't find my parent
2423         // anywhere..."
2424         //
2425         // Write a record to the replay log for an attempt to return to a stolen parent
2426         replay_record_orphaned(w);
2427
2428         // Update the pedigree only after we've finished the
2429         // reductions.
2430         update_pedigree_on_leave_frame(w, returning_sf);
2431
2432         // Notify Inspector that the parent has been stolen and we're
2433         // going to abandon this work and go do something else.  This
2434         // will match the cilk_leave_begin in the compiled code
2435         NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf);
2436
2437         DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self);
2438         longjmp_into_runtime(w, do_return_from_spawn, 0);
2439         DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self);
2440     }
2441     else
2442     {
2443         NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS);
2444
2445         // If we fail the exception check and return, then switch back
2446         // to working.
2447         STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2448         START_INTERVAL(w, INTERVAL_WORKING);
2449         return;
2450     }
2451 }
2452
2453 /* Return an exception to a stolen parent. */
2454 NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w,
2455                                         __cilkrts_stack_frame *returning_sf)
2456 {
2457     full_frame *ff = w->l->frame_ff;
2458     STOP_INTERVAL(w, INTERVAL_WORKING);
2459     START_INTERVAL(w, INTERVAL_IN_RUNTIME);
2460
2461     // This is almost the same as THE_exception_check, except
2462     // the detach didn't happen, we don't need to undo the tail
2463     // update.
2464     CILK_ASSERT(w->head == w->tail);
2465     w = execute_reductions_for_spawn_return(w, ff, returning_sf);
2466
2467     longjmp_into_runtime(w, do_return_from_spawn, 0);
2468     CILK_ASSERT(0);
2469 }
2470
2471 static void do_return_from_spawn(__cilkrts_worker *w,
2472                                  full_frame *ff,
2473                                  __cilkrts_stack_frame *sf)
2474 {
2475     full_frame *parent_ff;
2476     enum provably_good_steal_t steal_result = ABANDON_EXECUTION;
2477
2478     BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
2479         CILK_ASSERT(ff);
2480         CILK_ASSERT(!ff->is_call_child);
2481         CILK_ASSERT(sf == NULL);
2482         parent_ff = ff->parent;
2483
2484         BEGIN_WITH_FRAME_LOCK(w, ff) {
2485             decjoin(ff);
2486         } END_WITH_FRAME_LOCK(w, ff);
2487
2488         BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
2489             if (parent_ff->simulated_stolen)
2490                 unconditional_steal(w, parent_ff);
2491             else
2492                 steal_result = provably_good_steal(w, parent_ff);
2493         } END_WITH_FRAME_LOCK(w, parent_ff);
2494
2495     } END_WITH_WORKER_LOCK_OPTIONAL(w);
2496
2497     // Loop here in replay mode
2498 #ifdef CILK_RECORD_REPLAY
2499     // We don't have to explicitly check for REPLAY_LOG below because
2500     // steal_result can only get set to WAIT_FOR_CONTINUE during replay.
2501     // We also don't have to worry about the simulated_stolen flag
2502     // because steal_result can only be set to WAIT_FOR_CONTINUE by
2503     // provably_good_steal().
2504     while(WAIT_FOR_CONTINUE == steal_result)
2505     {
2506         __cilkrts_sleep();
2507         BEGIN_WITH_WORKER_LOCK_OPTIONAL(w)
2508         {
2509             BEGIN_WITH_FRAME_LOCK(w, parent_ff)
2510             {
2511                 steal_result = provably_good_steal(w, parent_ff);
2512             } END_WITH_FRAME_LOCK(w, parent_ff);
2513         } END_WITH_WORKER_LOCK_OPTIONAL(w);
2514     }
2515 #endif  // CILK_RECORD_REPLAY
2516
2517     // Cleanup the child frame.
2518     __cilkrts_destroy_full_frame(w, ff);
2519     return;
2520 }
2521
2522 #ifdef _WIN32
2523 /* migrate an exception across fibers.  Call this function when an exception has
2524  * been thrown and has to traverse across a steal.  The exception has already
2525  * been wrapped up, so all that remains is to longjmp() into the continuation,
2526  * sync, and re-raise it.
2527  */
2528 void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) {
2529
2530     __cilkrts_worker *w = sf->worker;
2531     full_frame *ff;
2532
2533     BEGIN_WITH_WORKER_LOCK(w) {
2534         ff = w->l->frame_ff;
2535         reset_THE_exception(w);
2536         /* there is no need to check for a steal because we wouldn't be here if
2537            there weren't a steal. */
2538         __cilkrts_save_exception_state(w, ff);
2539
2540         CILK_ASSERT(w->head == w->tail);
2541     } END_WITH_WORKER_LOCK(w);
2542
2543     {
2544         // TBD(jsukha): This function emulates the
2545         // the "do_return_from_spawn" path.
2546         w = execute_reductions_for_spawn_return(w, ff, sf);
2547     }
2548
2549     longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */
2550     CILK_ASSERT(! "Shouldn't be here...");
2551 }
2552 #endif
2553
2554
2555 /* Pop a call stack from TAIL.  Return the call stack, or NULL if the
2556    queue is empty */
2557 __cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w)
2558 {
2559     __cilkrts_stack_frame *sf;
2560     BEGIN_WITH_WORKER_LOCK(w) {
2561         __cilkrts_stack_frame *volatile *tail = w->tail;
2562         if (w->head < tail) {
2563             --tail;
2564             sf = *tail;
2565             w->tail = tail;
2566         } else {
2567             sf = 0;
2568         }
2569     } END_WITH_WORKER_LOCK(w);
2570     return sf;
2571 }
2572
2573 #ifdef CILK_RECORD_REPLAY
2574 __cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w)
2575 {
2576     __cilkrts_stack_frame *sf;
2577     BEGIN_WITH_WORKER_LOCK(w) {
2578         if (w->head < w->tail) {
2579             sf = *(w->tail-1);
2580         } else {
2581             sf = 0;
2582         }
2583     } END_WITH_WORKER_LOCK(w);
2584     return sf;
2585 }
2586 #endif
2587
2588
/* Return from a call, not a spawn.

   Worker W's current full frame must be a call child whose join counter
   is 1.  The frame is disowned, its join counter is decremented, any
   pending exception is moved from the worker into the frame, and the
   parent frame is told to finalize the child.  The worker's next frame,
   if there is one, is then set up for execution.  Time spent here is
   accounted as runtime rather than as user work. */
void __cilkrts_return(__cilkrts_worker *w)
{
    full_frame *ff, *parent_ff;

    // Count time during the return as in the runtime.
    STOP_INTERVAL(w, INTERVAL_WORKING);
    START_INTERVAL(w, INTERVAL_IN_RUNTIME);
    START_INTERVAL(w, INTERVAL_RETURNING);

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        /* This path is not used to return from spawn. */
        CILK_ASSERT(ff->is_call_child);

        BEGIN_WITH_FRAME_LOCK(w, ff) {
            // After this call, w->l->frame_ff != ff.
            // Technically, however, w will still "own" ff until ff is
            // freed, because ff is a dying leaf full frame.
            parent_ff = disown(w, ff, 0, "return");
            decjoin(ff);

#ifdef _WIN32
            __cilkrts_save_exception_state(w, ff);
#else
            // Move the pending exception from the worker into the full
            // frame.  The worker's pending exception is NULL unless this
            // is a return with an exception; the frame's slot must be
            // empty beforehand.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;
#endif  // _WIN32

        } END_WITH_FRAME_LOCK(w, ff);

        __cilkrts_fence(); /* redundant */

        CILK_ASSERT(parent_ff);

        // Hand the finished child to its parent, under the parent's lock.
        BEGIN_WITH_FRAME_LOCK(w, parent_ff) {
            finalize_child_for_call(w, parent_ff, ff);
        } END_WITH_FRAME_LOCK(w, parent_ff);

        // From here on ff names the frame to run next, not the frame
        // that just returned.
        ff = pop_next_frame(w);
        /* ff will be non-null except when the parent frame is owned
           by another worker.
           CILK_ASSERT(ff)
        */
        CILK_ASSERT(!w->l->frame_ff);
        if (ff) {
            BEGIN_WITH_FRAME_LOCK(w, ff) {
                // The frame being resumed must have a call stack whose
                // first frame has no call parent.
                __cilkrts_stack_frame *sf = ff->call_stack;
                CILK_ASSERT(sf && !sf->call_parent);
                setup_for_execution(w, ff, 1);
            } END_WITH_FRAME_LOCK(w, ff);
        }
    } END_WITH_WORKER_LOCK_OPTIONAL(w);

    // Back to user work.
    STOP_INTERVAL(w, INTERVAL_RETURNING);
    STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
    START_INTERVAL(w, INTERVAL_WORKING);
}
2653
2654 static void __cilkrts_unbind_thread()
2655 {
2656     int stop_cilkscreen = 0;
2657     global_state_t *g;
2658
2659     // Take out the global OS mutex to protect accesses to the table of workers
2660     global_os_mutex_lock();
2661
2662     if (cilkg_is_published()) {
2663         __cilkrts_worker *w = __cilkrts_get_tls_worker();
2664         if (w) {
2665             g = w->g;
2666
2667
2668             // Matches the START in bind_thread in cilk-abi.c.
2669             STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
2670             STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER);
2671
2672             __cilkrts_set_tls_worker(0);
2673
2674             if (w->self == -1) {
2675                 // This worker is an overflow worker.  I.e., it was created on-
2676                 // demand when the global pool ran out of workers.
2677                 destroy_worker(w);
2678                 __cilkrts_free(w);
2679             } else {
2680                 // This is a normal user worker and needs to be counted by the
2681                 // global state for the purposes of throttling system workers.
2682                 w->l->type = WORKER_FREE;
2683                 __cilkrts_leave_cilk(g);
2684             }
2685
2686             stop_cilkscreen = (0 == g->Q);
2687         }
2688     }
2689     global_os_mutex_unlock();
2690
2691     /* Turn off Cilkscreen.  This needs to be done when we are NOT holding the
2692      * os mutex. */
2693     if (stop_cilkscreen)
2694         __cilkrts_cilkscreen_disable_instrumentation();
2695 }
2696
/* Special return from the initial frame.

   Called on a user worker W.  Clears the worker's current full frame,
   deallocates the fiber attached to that frame, destroys the frame,
   saves the pedigree leaf, destroys the worker's reducer map (if it has
   one), and finally unbinds the calling thread. */

void __cilkrts_c_return_from_initial(__cilkrts_worker *w)
{
    struct cilkred_map *rm;

    // When we are returning from the initial frame, switch from
    // INTERVAL_WORKING into INTERVAL_IN_RUNTIME.
    STOP_INTERVAL(w, INTERVAL_WORKING);
    START_INTERVAL(w, INTERVAL_IN_RUNTIME);

    /* This is only called on a user thread worker. */
    CILK_ASSERT(w->l->type == WORKER_USER);

    #if REDPAR_DEBUG >= 3
    fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n",
            w->self, w->l->frame_ff);
    #endif

    BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) {
        // The initial frame must be the only thing left: a full frame
        // with a join counter of exactly 1.
        full_frame *ff = w->l->frame_ff;
        CILK_ASSERT(ff);
        CILK_ASSERT(ff->join_counter == 1);
        w->l->frame_ff = 0;

        CILK_ASSERT(ff->fiber_self);
        // Save any TBB interop data for the next time this thread enters Cilk
        cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self);

        // Deallocate cilk_fiber that mapped to the user stack.  The stack
        // itself does not get deallocated (of course) but our data
        // structure becomes divorced from it.

#if FIBER_DEBUG >= 1
        fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self  = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n",
                cilkos_get_current_thread_id(),
                w->self,
                ff->fiber_self,
                w->l->scheduling_fiber,
                w->l->type);
#endif
        // The fiber in ff is a user-code fiber.  The fiber in
        // w->l->scheduling_fiber is a scheduling fiber.  These fibers should
        // never be equal.  When a user worker returns (and will unbind), we
        // should destroy only the fiber in ff.  The scheduling fiber will be
        // re-used.

        CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber);

        START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) {
            // This fiber might not be deallocated here if there
            // is a pending exception on Windows that refers
            // to this fiber.
            //
            // First "suspend" the fiber, and then try to delete it.
            cilk_fiber_deallocate_from_thread(ff->fiber_self);
        } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE);
        ff->fiber_self = NULL;

        /* Detach the reducer map from the worker.  It is destroyed
           further down, after the worker lock has been released. */
        rm = w->reducer_map;
        w->reducer_map = NULL;

#if REDPAR_DEBUG >= 3
        fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n",
                w->self,
                rm,
                ff);
#endif
        __cilkrts_destroy_full_frame(w, ff);


        /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */
    } END_WITH_WORKER_LOCK_OPTIONAL(w);


    save_pedigree_leaf_from_user_worker(w);

    // Workers can have NULL reducer maps now.
    if (rm) {
        __cilkrts_destroy_reducer_map(w, rm);
    }


#if FIBER_DEBUG >= 1
    // Keep copies for the debug message printed after unbinding, because
    // w is set to NULL before the unbind call.
    __cilkrts_worker* tmp = w;
    int tmp_id = w->self;
    fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n",
            w->self,
            w);
#endif

    // w is not used again in this function; the unbind call below takes
    // no arguments.
    w = NULL;

    __cilkrts_unbind_thread();

#if FIBER_DEBUG >= 1

    fprintf(stderr, "w=%p, %d: Finished unbind\n",
            tmp, tmp_id);
#endif

    /* Other workers will stop trying to steal if this was the last worker. */

    return;
}
2803
2804
/*
 * __cilkrts_restore_stealing
 *
 * Restore the protected_tail of worker W to SAVED_PROTECTED_TAIL, a
 * previously saved value, possibly allowing frames to be stolen again.
 * The dekker_protocol has been extended to steal only if head+1 is
 * < protected_tail.
 */

void __cilkrts_restore_stealing(
    __cilkrts_worker *w,
    __cilkrts_stack_frame *volatile *saved_protected_tail)
{
    /* On most x86 this pair of operations would be slightly faster
       as an atomic exchange due to the implicit memory barrier in
       an atomic instruction. */
    w->protected_tail = saved_protected_tail;
    /* The fence is issued after the store to protected_tail. */
    __cilkrts_fence();
}
2823
2824 /*
2825  * __cilkrts_disallow_stealing
2826  *
2827  * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any
2828  * frames from being stolen.  If NEW_PROTECTED_TAIL is NULL, prevent
2829  * stealing from the whole queue.  The dekker_protocol has been
2830  * extended to only steal if head+1 is also < protected_tail.
2831  */
2832
2833 __cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing(
2834     __cilkrts_worker *w,
2835     __cilkrts_stack_frame *volatile *new_protected_tail)
2836 {
2837     __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail;
2838
2839     if (!new_protected_tail)
2840         new_protected_tail = w->l->ltq;
2841
2842     if (w->protected_tail > new_protected_tail) {
2843         w->protected_tail = new_protected_tail;
2844         /* Issue a store-store barrier.  The update to protected_tail
2845            here must precede the update to tail in the next spawn.
2846            On x86 this is probably not needed. */
2847 #if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__)
2848         _mm_sfence();
2849 #else
2850         __cilkrts_fence();
2851 #endif
2852     }
2853
2854     return saved_protected_tail;
2855 }
2856
2857 /*************************************************************
2858   Initialization and startup
2859 *************************************************************/
2860
/* Initialize the worker structure W, supplied by the caller, as worker
   number SELF of global state G, and return W.  The function allocates
   the worker's local state and its deque (g->ltqsize entries) and
   leaves the worker with type WORKER_FREE, an empty deque, and no
   scheduling fiber. */
__cilkrts_worker *make_worker(global_state_t *g,
                              int self, __cilkrts_worker *w)
{
    w->self = self;
    w->g = g;

    w->pedigree.rank = 0;    // Initial rank is 0
    w->pedigree.parent = NULL;

    // Everything reached through w->l below lives in this allocation.
    w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l));

    __cilkrts_frame_malloc_per_worker_init(w);

    w->reducer_map = NULL;
    w->current_stack_frame = NULL;
    w->reserved = NULL;

    w->l->worker_magic_0 = WORKER_MAGIC_0;
    w->l->team = NULL;
    w->l->type = WORKER_FREE;

    __cilkrts_mutex_init(&w->l->lock);
    __cilkrts_mutex_init(&w->l->steal_lock);
    w->l->do_not_steal = 0;
    w->l->frame_ff = 0;
    w->l->next_frame_ff = 0;
    w->l->last_full_frame = NULL;

    // Allocate the deque; head and tail both start at its base, so the
    // deque is initially empty.
    w->l->ltq = (__cilkrts_stack_frame **)
        __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq));
    w->ltq_limit = w->l->ltq + g->ltqsize;
    w->head = w->tail = w->l->ltq;

    // The per-worker fiber pool is given the global pool as its parent.
    cilk_fiber_pool_init(&w->l->fiber_pool,
                         &g->fiber_pool,
                         g->stack_size,
                         g->fiber_pool_size,
                         0,   // alloc_max is 0.  We don't allocate from the heap directly without checking the parent pool.
                         0);
#if FIBER_DEBUG >= 2
    fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n",
            cilkos_get_current_thread_id(),
            w->self, w,
            &w->l->fiber_pool);
#endif
    w->l->scheduling_fiber = NULL;
    w->l->original_pedigree_leaf = NULL;
    w->l->rand_seed = 0; /* the scheduler will overwrite this field */

    w->l->post_suspend = 0;
    w->l->suspended_stack = 0;
    w->l->fiber_to_free = NULL;
    w->l->pending_exception = NULL;

    // Statistics storage exists only in profiling builds.
#if CILK_PROFILE
    w->l->stats = __cilkrts_malloc(sizeof(statistics));
    __cilkrts_init_stats(w->l->stats);
#else
    w->l->stats = NULL;
#endif
    w->l->steal_failure_count = 0;
    w->l->has_stolen = 0;

    // Nothing's been stolen yet
    w->l->work_stolen = 0;

    // Initialize record/replay assuming we're doing neither
    w->l->record_replay_fptr = NULL;
    w->l->replay_list_root = NULL;
    w->l->replay_list_entry = NULL;
    w->l->signal_node = NULL;
    w->l->worker_magic_1 = WORKER_MAGIC_1;

    /*w->parallelism_disabled = 0;*/

    // Allow stealing all frames.  Sets w->protected_tail to the end of
    // the deque.
    __cilkrts_restore_stealing(w, w->ltq_limit);

    __cilkrts_init_worker_sysdep(w);

    reset_THE_exception(w);

    return w;
}
2945
2946 void destroy_worker(__cilkrts_worker *w)
2947 {
2948     CILK_ASSERT (NULL == w->l->pending_exception);
2949
2950     // Deallocate the scheduling fiber
2951     if (NULL != w->l->scheduling_fiber)
2952     {
2953         // The scheduling fiber is the main fiber for system workers and must
2954         // be deallocated by the thread that created it.  Thus, we can
2955         // deallocate only free workers' (formerly user workers) scheduling
2956         // fibers here.
2957         CILK_ASSERT(WORKER_FREE == w->l->type);
2958
2959 #if FIBER_DEBUG >=1
2960         fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n",
2961                 cilkos_get_current_thread_id(),
2962                 w,
2963                 w->self,
2964                 w->l->scheduling_fiber);
2965 #endif
2966         int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL);
2967         // Scheduling fiber should never have extra references because of exceptions.
2968         CILK_ASSERT(0 == ref_count);
2969         w->l->scheduling_fiber = NULL;
2970     }
2971
2972 #if CILK_PROFILE
2973     if (w->l->stats) {
2974         __cilkrts_free(w->l->stats);
2975     }
2976 #else
2977     CILK_ASSERT(NULL == w->l->stats);
2978 #endif
2979
2980     /* Free any cached fibers. */
2981     cilk_fiber_pool_destroy(&w->l->fiber_pool);
2982
2983     __cilkrts_destroy_worker_sysdep(w);
2984
2985     if (w->l->signal_node) {
2986         CILK_ASSERT(WORKER_SYSTEM == w->l->type);
2987         signal_node_destroy(w->l->signal_node);
2988     }
2989
2990     __cilkrts_free(w->l->ltq);
2991     __cilkrts_mutex_destroy(0, &w->l->lock);
2992     __cilkrts_mutex_destroy(0, &w->l->steal_lock);
2993     __cilkrts_frame_malloc_per_worker_cleanup(w);
2994
2995     __cilkrts_free(w->l);
2996
2997     // The caller is responsible for freeing the worker memory
2998 }
2999
3000 /*
3001  * Make a worker into a system worker.
3002  */
3003 static void make_worker_system(__cilkrts_worker *w) {
3004     CILK_ASSERT(WORKER_FREE == w->l->type);
3005     w->l->type = WORKER_SYSTEM;
3006     w->l->signal_node = signal_node_create();
3007 }
3008
3009 void __cilkrts_deinit_internal(global_state_t *g)
3010 {
3011     int i;
3012     __cilkrts_worker *w;
3013
3014     // If there's no global state then we're done
3015     if (NULL == g)
3016         return;
3017
3018 #ifdef CILK_PROFILE
3019     __cilkrts_dump_stats_to_stderr(g);
3020 #endif
3021
3022     w = g->workers[0];
3023     if (w->l->frame_ff) {
3024         __cilkrts_destroy_full_frame(w, w->l->frame_ff);
3025         w->l->frame_ff = 0;
3026     }
3027
3028     // Release any resources used for record/replay
3029     replay_term(g);
3030
3031     // Destroy any system dependent global state
3032     __cilkrts_destroy_global_sysdep(g);
3033
3034     for (i = 0; i < g->total_workers; ++i)
3035         destroy_worker(g->workers[i]);
3036
3037     // Free memory for all worker blocks which were allocated contiguously
3038     __cilkrts_free(g->workers[0]);
3039
3040     __cilkrts_free(g->workers);
3041
3042     cilk_fiber_pool_destroy(&g->fiber_pool);
3043     __cilkrts_frame_malloc_global_cleanup(g);
3044
3045     cilkg_deinit_global_state();
3046 }
3047
3048 /*
3049  * Wake the runtime by notifying the system workers that they can steal.  The
3050  * first user worker into the runtime should call this.
3051  */
3052 static void wake_runtime(global_state_t *g)
3053 {
3054     __cilkrts_worker *root;
3055     if (g->P > 1) {
3056         // Send a message to the root node.  The message will propagate.
3057         root = g->workers[0];
3058         CILK_ASSERT(root->l->signal_node);
3059         signal_node_msg(root->l->signal_node, 1);
3060     }
3061 }
3062
3063 /*
3064  * Put the runtime to sleep.  The last user worker out of the runtime should
3065  * call this.  Like Dad always said, turn out the lights when nobody's in the
3066  * room.
3067  */
3068 static void sleep_runtime(global_state_t *g)
3069 {
3070     __cilkrts_worker *root;
3071     if (g->P > 1) {
3072         // Send a message to the root node.  The message will propagate.
3073         root = g->workers[0];
3074         CILK_ASSERT(root->l->signal_node);
3075         signal_node_msg(root->l->signal_node, 0);
3076     }
3077 }
3078
3079 /* Called when a user thread joins Cilk.
3080    Global lock must be held. */
3081 void __cilkrts_enter_cilk(global_state_t *g)
3082 {
3083     if (g->Q++ == 0) {
3084         // If this is the first user thread to enter Cilk wake
3085         // up all the workers.
3086         wake_runtime(g);
3087     }
3088 }
3089
3090 /* Called when a user thread leaves Cilk.
3091    Global lock must be held. */
3092 void __cilkrts_leave_cilk(global_state_t *g)
3093 {
3094     if (--g->Q == 0) {
3095         // Put the runtime to sleep.
3096         sleep_runtime(g);
3097     }
3098 }
3099
3100 /*
3101  * worker_runnable
3102  *
3103  * Return true if the worker should continue to try to steal.  False, otherwise.
3104  */
3105
3106 NOINLINE
3107 static enum schedule_t worker_runnable(__cilkrts_worker *w)
3108 {
3109     global_state_t *g = w->g;
3110
3111     /* If this worker has something to do, do it.
3112        Otherwise the work would be lost. */
3113     if (w->l->next_frame_ff)
3114         return SCHEDULE_RUN;
3115
3116     // If Cilk has explicitly (by the user) been told to exit (i.e., by
3117     // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return 0.
3118     if (g->work_done)
3119         return SCHEDULE_EXIT;
3120
3121     if (0 == w->self) {
3122         // This worker is the root node and is the only one that may query the
3123         // global state to see if there are still any user workers in Cilk.
3124         if (w->l->steal_failure_count > g->max_steal_failures) {
3125             if (signal_node_should_wait(w->l->signal_node)) {
3126                 return SCHEDULE_WAIT;
3127             } else {
3128                 // Reset the steal_failure_count since we have verified that
3129                 // user workers are still in Cilk.
3130                 w->l->steal_failure_count = 0;
3131             }
3132         }
3133     } else if (WORKER_SYSTEM == w->l->type &&
3134                signal_node_should_wait(w->l->signal_node)) {
3135         // This worker has been notified by its parent that it should stop
3136         // trying to steal.
3137         return SCHEDULE_WAIT;
3138     }
3139
3140     return SCHEDULE_RUN;
3141 }
3142
3143
3144
3145 // Initialize the worker structs, but don't start the workers themselves.
3146 static void init_workers(global_state_t *g)
3147 {
3148     int total_workers = g->total_workers;
3149     int i;
3150     struct CILK_ALIGNAS(256) buffered_worker {
3151         __cilkrts_worker w;
3152         char buf[64];
3153     } *workers_memory;
3154
3155     /* not needed if only one worker */
3156     cilk_fiber_pool_init(&g->fiber_pool,
3157                          NULL,
3158                          g->stack_size,
3159                          g->global_fiber_pool_size,           // buffer_size
3160                          g->max_stacks,                       // maximum # to allocate
3161                          1);
3162
3163     cilk_fiber_pool_set_fiber_limit(&g->fiber_pool,
3164                                     (g->max_stacks ? g->max_stacks : INT_MAX));
3165
3166     g->workers = (__cilkrts_worker **)
3167         __cilkrts_malloc(total_workers * sizeof(*g->workers));
3168
3169     // Allocate 1 block of memory for workers to make life easier for tools
3170     // like Inspector which run multithreaded and need to know the memory
3171     // range for all the workers that will be accessed in a user's program
3172     workers_memory = (struct buffered_worker*)
3173         __cilkrts_malloc(sizeof(*workers_memory) * total_workers);
3174
3175     // Notify any tools that care (Cilkscreen and Inspector) that they should
3176     // ignore memory allocated for the workers
3177     __cilkrts_cilkscreen_ignore_block(&workers_memory[0],
3178                                       &workers_memory[total_workers]);
3179
3180     // Initialize worker structs, including unused worker slots.
3181     for (i = 0; i < total_workers; ++i) {
3182         g->workers[i] = make_worker(g, i, &workers_memory[i].w);
3183     }
3184
3185     // Set the workers in the first P - 1 slots to be system workers.
3186     // Remaining worker structs already have type == 0.
3187     for (i = 0; i < g->system_workers; ++i) {
3188         make_worker_system(g->workers[i]);
3189     }
3190 }
3191
3192 void __cilkrts_init_internal(int start)
3193 {
3194     global_state_t *g = NULL;
3195
3196     if (cilkg_is_published()) {
3197         g = cilkg_init_global_state();
3198     }
3199     else {
3200
3201         // We think the state has not been published yet.
3202         // Grab the lock and try to initialize/publish.
3203         global_os_mutex_lock();
3204
3205         if (cilkg_is_published()) {
3206             // Some other thread must have snuck in and published.
3207             g = cilkg_init_global_state();
3208         }
3209         else {
3210             // Initialize and retrieve global state
3211             g = cilkg_init_global_state();
3212
3213             // Set the scheduler pointer
3214             g->scheduler = worker_scheduler_function;
3215
3216             // If we're running under a sequential P-Tool (Cilkscreen or
3217             // Cilkview) then there's only one worker and we need to tell
3218             // the tool about the extent of the stack
3219             if (g->under_ptool)
3220                 __cilkrts_establish_c_stack();
3221             init_workers(g);
3222
3223             // Initialize per-work record/replay logging
3224             replay_init_workers(g);
3225
3226             // Initialize any system dependent global state
3227             __cilkrts_init_global_sysdep(g);
3228
3229
3230             cilkg_publish_global_state(g);
3231         }
3232
3233         global_os_mutex_unlock();
3234     }
3235
3236     CILK_ASSERT(g);
3237
3238     if (start && !g->workers_running)
3239     {
3240         // Acquire the global OS mutex while we're starting the workers
3241         global_os_mutex_lock();
3242         if (!g->workers_running)
3243             // Start P - 1 system workers since P includes the first user
3244             // worker.
3245             __cilkrts_start_workers(g, g->P - 1);
3246         global_os_mutex_unlock();
3247     }
3248 }
3249
3250
3251 /************************************************************************
3252   Methods for reducer protocol.
3253
3254   Reductions occur in two places:
3255     A. A full frame "ff" is returning from a spawn with a stolen parent.
3256     B. A full frame "ff" is stalling at a sync.
3257
3258   To support parallel reductions, reduction functions need to be
3259   executed while control is on a user stack, before jumping into the
3260   runtime.  These reductions can not occur while holding a worker or
3261   frame lock.
3262
3263   Before a worker w executes a reduction in either Case A or B, w's
3264   deque is empty.
3265
3266   Since parallel reductions push work onto the deque, we must do extra
3267   work to set up runtime data structures properly before reductions
3268   begin to allow stealing.  ( Normally, when we have only serial
3269   reductions, once a worker w starts a reduction, its deque remains
3270   empty until w either steals another frame or resumes a suspended
3271   frame.  Thus, we don't care about the state of the deque, since w
3272   will reset its deque when setting up execution of a frame. )
3273
3274   To allow for parallel reductions, we coerce the runtime data
3275   structures so that, from their perspective, it looks as though we
3276   have spliced in an "execute_reductions()" function.  Consider the
3277   two cases for reductions:
3278
3279     Case A: Return from a spawn with a stolen parent.
3280       Consider a spawned function g is returning on a worker w.
3281       Assume:
3282           -   g was spawned from a parent function f.
3283           -   ff is the full frame for g's spawn helper
3284           -   sf is the __cilkrts_stack_frame for g's spawn helper.
3285
3286       We are conceptually splicing "execute_reductions()" so that it
3287       occurs immediately before the spawn helper of g returns to f.
3288
3289       We do so by creating two different world views --- one for the
3290       runtime data structures, and one for the actual control flow.
3291
3292         - Before reductions begin, the runtime data structures should
3293           look as though the spawn helper of g is calling
3294           "execute_reductions()", in terms of both the user stack and
3295           worker deque.  More precisely, w should satisfy the
3296           following properties:
3297
3298               (a) w has ff as its full frame,
3299               (b) w has sf as its __cilkrts_stack_frame, and
3300               (c) w has an empty deque.
3301
3302           If the runtime satisfies these properties, then if w
3303           encounters a spawn in a parallel reduction, it can push onto
3304           a valid deque.  Also, when a steal from w occurs, it will
3305           build the correct tree of full frames.
3306
3307         - In actual control flow, however, once the
3308           "execute_reductions()" function returns, it is actually
3309           returning to runtime code instead of g's spawn helper.
3310
3311           At the point a worker w began executing reductions, the
3312           control flow / compiled code had already finished g's spawn
3313           helper, and w was about to enter the runtime.  With parallel
3314           reductions, some worker v (which might be different from w)
3315           is the one returning to the runtime.
3316
3317
3318       The reduction logic consists of 4 steps:
3319
3320        A1. Restore runtime data structures to make it look as though
3321            the spawn helper of g() is still the currently executing
3322            frame for w.
3323
3324        A2. Execute reductions on the user stack.  Reductions also
3325            include the logic for exceptions and stacks.  Note that
3326            reductions start on w, but may finish on a different
3327            worker if there is parallelism in the reduce.
3328
3329        A3. Splice out ff from the tree of full frames.
3330
3331        A4. Jump into the runtime/scheduling stack and execute
3332            "do_return_from_spawn".  This method:
3333
3334            (a) Frees the user stack we were just on if it is no longer needed.
3335            (b) Decrements the join counter on ff->parent, and tries to do a
3336                provably good steal.
3337            (c) Cleans up the full frame ff.
3338
3339
3340    Case B: Stalling at a sync.
3341
3342      Consider a function g(), with full frame ff and
3343      __cilkrts_stack_frame sf.  Suppose g() stalls at a sync, and we
3344      are executing reductions.
3345
3346      Conceptually, we are splicing in an "execute_reductions()"
3347      function into g() as the last action that g() takes immediately
3348      before it executes the cilk_sync.
3349
3350      The reduction logic for this case is similar to Case A.
3351
3352        B1. Restore the runtime data structures.
3353
3354            The main difference from Case A is that ff/sf is still a
3355            frame that needs to be executed later (since it is stalling
3356            at a cilk_sync).  Thus, we also need to save the current
3357            stack information into "ff" so that we can correctly resume
3358            execution of "ff" after the sync.
3359
3360        B2. Execute reductions on the user stack.
3361
3362        B3. No frame to splice out of the tree.
3363
3364        B4. Jump into the runtime/scheduling stack and execute "do_sync".
3365            This method:
3366            (a) Frees the user stack we were just on if it is no longer needed.
3367            (b) Tries to execute a provably good steal.
3368
3369   Finally, for the reducer protocol, we consider two reduction paths,
3370   namely a "fast" and "slow" path.  On a fast path, only trivial
3371   merges of reducer maps happen (i.e., one or both of the maps are
3372   NULL).  Otherwise, on the slow path, a reduction actually needs to
3373   happen.
3374
3375 *****************************************************************/
3376
/**
 * @brief Destinations for the results of a reduction.
 *
 * Bundles the addresses of the two fields to our "left" that get
 * updated when a full frame is spliced out of the tree or stalls at a
 * sync: one slot for the reducer map and one for the pending exception.
 */
typedef struct splice_left_ptrs {
    /** Address of the slot that holds the left reducer map. */
    struct cilkred_map **map_ptr;

    /** Address of the slot that holds the left pending exception. */
    struct pending_exception_info **exception_ptr;
} splice_left_ptrs;
3390
3391 /**
3392  * For a full frame returning from a spawn, calculate the pointers to
3393  * the maps and exceptions to my left.
3394  *
3395  * @param w   The currently executing worker.
3396  * @param ff  Full frame that is dying
3397  * @return    Pointers to our "left" for reducers and exceptions.
3398  */
3399 static inline
3400 splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w,
3401                                                     full_frame *ff)
3402 {
3403     // ASSERT: we hold the lock on ff->parent
3404
3405     splice_left_ptrs left_ptrs;
3406     if (ff->left_sibling) {
3407         left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map;
3408         left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception;
3409     }
3410     else {
3411         full_frame *parent_ff = ff->parent;
3412         left_ptrs.map_ptr = &parent_ff->children_reducer_map;
3413         left_ptrs.exception_ptr = &parent_ff->child_pending_exception;
3414     }
3415     return left_ptrs;
3416 }
3417
3418 /**
3419  * For a full frame at a sync, calculate the pointers to the maps and
3420  * exceptions to my left.
3421  *
3422  * @param w   The currently executing worker.
3423  * @param ff  Full frame that is stalling at a sync.
3424  * @return    Pointers to our "left" for reducers and exceptions.
3425  */
3426 static inline
3427 splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w,
3428                                             full_frame *ff)
3429 {
3430     // ASSERT: we hold the lock on ff
3431     splice_left_ptrs left_ptrs;
3432
3433     // Figure out which map to the left we should merge into.
3434     if (ff->rightmost_child) {
3435         CILK_ASSERT(ff->rightmost_child->parent == ff);
3436         left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map);
3437         left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception);
3438     }
3439     else {
3440         // We have no children.  Then, we should be the last
3441         // worker at the sync... "left" is our child map.
3442         left_ptrs.map_ptr = &(ff->children_reducer_map);
3443         left_ptrs.exception_ptr = &(ff->child_pending_exception);
3444     }
3445     return left_ptrs;
3446 }
3447
3448 /**
3449  * After we have completed all reductions on a spawn return, call this
3450  * method to finish up before jumping into the runtime.
3451  *
3452  *   1. Perform the "reduction" on stacks, i.e., execute the left
3453  *      holder logic to pass the leftmost stack up.
3454  *
3455  *      w->l->fiber_to_free holds any stack that needs to be freed
3456  *      when control switches into the runtime fiber.
3457  *
3458  *   2. Unlink and remove child_ff from the tree of full frames.
3459  *
3460  * @param   w          The currently executing worker.
3461  * @param   parent_ff  The parent of child_ff.
3462  * @param   child_ff   The full frame returning from a spawn.
3463  */
3464 static inline
3465 void finish_spawn_return_on_user_stack(__cilkrts_worker *w,
3466                                        full_frame *parent_ff,
3467                                        full_frame *child_ff)
3468 {
3469     CILK_ASSERT(w->l->fiber_to_free == NULL);
3470
3471     // Execute left-holder logic for stacks.
3472     if (child_ff->left_sibling || parent_ff->fiber_child) {
3473         // Case where we are not the leftmost stack.
3474         CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self);
3475
3476         // Remember any fiber we need to free in the worker.
3477         // After we jump into the runtime, we will actually do the
3478         // free.
3479         w->l->fiber_to_free = child_ff->fiber_self;
3480     }
3481     else {
3482         // We are leftmost, pass stack/fiber up to parent.
3483         // Thus, no stack/fiber to free.
3484         parent_ff->fiber_child = child_ff->fiber_self;
3485         w->l->fiber_to_free = NULL;
3486     }
3487
3488     child_ff->fiber_self = NULL;
3489
3490     unlink_child(parent_ff, child_ff);
3491 }
3492
3493
3494 /**
3495  * Executes any fast reductions necessary to splice ff out of the tree
3496  * of full frames.
3497  *
3498  * This "fast" path performs only trivial merges of reducer maps,
3499  * i.e,. when one of them is NULL.
3500  * (See slow_path_reductions_for_spawn_return() for slow path.)
3501  *
3502  * Returns: 1 if we finished all reductions.
3503  * Returns: 0 if there are still reductions to execute, and
3504  *            we should execute the slow path.
3505  *
3506  * This method assumes w holds the frame lock on parent_ff.
3507  * After this method completes:
3508  *    1. We have spliced ff out of the tree of full frames.
3509  *    2. The reducer maps of child_ff have been deposited
3510  *       "left" according to the reducer protocol.
3511  *    3. w->l->stack_to_free stores the stack
3512  *       that needs to be freed once we jump into the runtime.
3513  *
3514  * We have not, however, decremented the join counter on ff->parent.
3515  * This prevents any other workers from resuming execution of the parent.
3516  *
3517  * @param   w    The currently executing worker.
3518  * @param   ff   The full frame returning from a spawn.
3519  * @return  NULL if we finished all reductions.
3520  * @return  The address where the left map is stored (which should be passed to
3521  *          slow_path_reductions_for_spawn_return()) if there are
3522  *          still reductions to execute.
3523  */
3524 struct cilkred_map**
3525 fast_path_reductions_for_spawn_return(__cilkrts_worker *w,
3526                                       full_frame *ff)
3527 {
3528     // ASSERT: we hold ff->parent->lock.
3529     splice_left_ptrs left_ptrs;
3530
3531     CILK_ASSERT(NULL == w->l->pending_exception);
3532
3533     // Figure out the pointers to the left where I want
3534     // to put reducers and exceptions.
3535     left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);
3536
3537     // Go ahead and merge exceptions while holding the lock.
3538     splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
3539
3540     // Now check if we have any reductions to perform.
3541     //
3542     // Consider all the cases of left, middle and right maps.
3543     //  0. (-, -, -)  :  finish and return 1
3544     //  1. (L, -, -)  :  finish and return 1
3545     //  2. (-, M, -)  :  slide over to left, finish, and return 1.
3546     //  3. (L, M, -)  :  return 0
3547     //  4. (-, -, R)  :  slide over to left, finish, and return 1.
3548     //  5. (L, -, R)  :  return 0
3549     //  6. (-, M, R)  :  return 0
3550     //  7. (L, M, R)  :  return 0
3551     //
3552     // In terms of code:
3553     //  L == *left_ptrs.map_ptr
3554     //  M == w->reducer_map
3555     //  R == f->right_reducer_map.
3556     //
3557     // The goal of the code below is to execute the fast path with
3558     // as few branches and writes as possible.
3559
3560     int case_value = (*(left_ptrs.map_ptr) != NULL);
3561     case_value += ((w->reducer_map != NULL) << 1);
3562     case_value += ((ff->right_reducer_map != NULL) << 2);
3563
3564     // Fastest path is case_value == 0 or 1.
3565     if (case_value >=2) {
3566         switch (case_value) {
3567         case 2:
3568             *(left_ptrs.map_ptr) = w->reducer_map;
3569             w->reducer_map = NULL;
3570             return NULL;
3571             break;
3572         case 4:
3573             *(left_ptrs.map_ptr) = ff->right_reducer_map;
3574             ff->right_reducer_map = NULL;
3575             return NULL;
3576         default:
3577             // If we have to execute the slow path, then
3578             // return the pointer to the place to deposit the left
3579             // map.
3580             return left_ptrs.map_ptr;
3581         }
3582     }
3583
3584     // Do nothing
3585     return NULL;
3586 }
3587
3588
/**
 * Slow path of the reducer protocol for a full frame "ff" returning
 * from a spawn: there are non-NULL reducer maps that must actually be
 * merged.
 *
 * This method should execute only if
 * fast_path_reductions_for_spawn_return() returned a non-NULL
 * left_map_ptr, and that value should be passed in as left_map_ptr.
 *
 * Each iteration slides the maps as far left as possible; if a middle
 * map (w->reducer_map) remains, the left, middle and right maps are
 * removed from their slots, the lock on ff->parent is released, the
 * maps are merged with repeated_merge_reducer_maps(), and the lock is
 * retaken before the left pointers are recomputed.  The loop ends once
 * no middle map is left.
 *
 * After this method completes, the reducer maps of ff have been
 * deposited "left" according to the reducer protocol.  This method does
 * not unlink ff or decrement the join counter on ff->parent; the caller
 * does the unlinking afterwards.
 *
 * WARNING:
 *   This method assumes the lock on ff->parent is held upon entry, and
 *   upon exit the worker that returns still holds a lock on ff->parent.
 *   This method can, however, release and reacquire the lock on
 *   ff->parent in between.
 *
 * @param w             The currently executing worker.
 * @param ff            The full frame returning from a spawn.
 * @param left_map_ptr  Pointer to our initial left map.
 * @return              The worker that this method returns on (w is
 *                      passed by address to repeated_merge_reducer_maps(),
 *                      so it may differ from the worker passed in).
 */
static __cilkrts_worker*
slow_path_reductions_for_spawn_return(__cilkrts_worker *w,
                                      full_frame *ff,
                                      struct cilkred_map **left_map_ptr)
{

    // CILK_ASSERT: w is holding frame lock on parent_ff.
#if REDPAR_DEBUG > 0
    CILK_ASSERT(!ff->rightmost_child);
    CILK_ASSERT(!ff->is_call_child);
#endif

    // Loop invariant:
    // When beginning this loop, we should
    //   1. Be holding the lock on ff->parent.
    //   2. left_map_ptr should be the address of the pointer to the left map.
    //   3. All maps should be slid over left by one, if possible.
    //   4. All exceptions should be merged so far.
    while (1) {

        // Slide middle map left if possible.
        if (!(*left_map_ptr)) {
            *left_map_ptr = w->reducer_map;
            w->reducer_map = NULL;
        }
        // Slide right map to middle if possible.
        if (!w->reducer_map) {
            w->reducer_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;
        }

        // Since we slid everything left by one,
        // we are finished if there is no middle map.
        // (This is the only exit from the loop; the lock on ff->parent
        // is still held here.)
        if (!w->reducer_map) {
            verify_current_wkr(w);
            return w;
        }
        else {
            struct cilkred_map* left_map;
            struct cilkred_map* middle_map;
            struct cilkred_map* right_map;

            // Take all the maps from their respective locations.
            // We can't leave them in place and execute a reduction because these fields
            // might change once we release the lock.
            left_map = *left_map_ptr;
            *left_map_ptr = NULL;
            middle_map = w->reducer_map;
            w->reducer_map = NULL;
            right_map = ff->right_reducer_map;
            ff->right_reducer_map = NULL;

            // WARNING!!! Lock release here.
            // We have reductions to execute (and we can't hold locks).
            __cilkrts_frame_unlock(w, ff->parent);

            // After we've released the lock, start counting time as
            // WORKING again.
            STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
            START_INTERVAL(w, INTERVAL_WORKING);

            // Merge all reducers into the left map: first left with
            // middle, then that result with right.  w is passed by
            // address, so it may refer to a different worker afterwards.
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   middle_map);
            verify_current_wkr(w);
            left_map = repeated_merge_reducer_maps(&w,
                                                   left_map,
                                                   right_map);
            verify_current_wkr(w);
            CILK_ASSERT(NULL == w->reducer_map);
            // Put the final answer back into w->reducer_map, i.e. it
            // becomes the new "middle" map for the next iteration.
            w->reducer_map = left_map;

            // Save any exceptions generated because of the reduction
            // process from the returning worker.  These get merged
            // the next time around the loop.
            CILK_ASSERT(NULL == ff->pending_exception);
            ff->pending_exception = w->l->pending_exception;
            w->l->pending_exception = NULL;

            STOP_INTERVAL(w, INTERVAL_WORKING);
            START_INTERVAL(w, INTERVAL_IN_RUNTIME);

            // Lock ff->parent for the next loop around.
            __cilkrts_frame_lock(w, ff->parent);

            // Once we have the lock again, recompute who is to our
            // left (the old left_map_ptr was computed before the lock
            // was dropped and is not reused).
            splice_left_ptrs left_ptrs;
            left_ptrs = compute_left_ptrs_for_spawn_return(w, ff);

            // Update the pointer for the left map.
            left_map_ptr = left_ptrs.map_ptr;
            // Splice the exceptions for spawn.
            splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr);
        }
    }
    // We should never break out of this loop.

    CILK_ASSERT(0);
    return NULL;
}
3727
3728
3729
3730 /**
3731  * Execute reductions when returning from a spawn whose parent has
3732  * been stolen.
3733  *
3734  * Execution may start on w, but may finish on a different worker.
3735  * This method acquires/releases the lock on ff->parent.
3736  *
3737  * @param w            The currently executing worker.
3738  * @param ff           The full frame of the spawned function that is returning.
3739  * @param returning_sf The __cilkrts_stack_frame for this returning function.
3740  * @return             The worker returning from this method.
3741  */
3742 static __cilkrts_worker*
3743 execute_reductions_for_spawn_return(__cilkrts_worker *w,
3744                                     full_frame *ff,
3745                                     __cilkrts_stack_frame *returning_sf)
3746 {
3747     // Step A1 from reducer protocol described above.
3748     //
3749     // Coerce the runtime into thinking that
3750     // ff/returning_sf are still on the bottom of
3751     // w's deque.
3752     restore_frame_for_spawn_return_reduction(w, ff, returning_sf);
3753
3754     // Step A2 and A3: Execute reductions on user stack.
3755     BEGIN_WITH_FRAME_LOCK(w, ff->parent) {
3756         struct cilkred_map **left_map_ptr;
3757         left_map_ptr = fast_path_reductions_for_spawn_return(w, ff);
3758
3759         // Pointer will be non-NULL if there are
3760         // still reductions to execute.
3761         if (left_map_ptr) {
3762             // WARNING: This method call may release the lock
3763             // on ff->parent and re-acquire it (possibly on a
3764             // different worker).
3765             // We can't hold locks while actually executing
3766             // reduce functions.
3767             w = slow_path_reductions_for_spawn_return(w,
3768                                                       ff,
3769                                                       left_map_ptr);
3770             verify_current_wkr(w);
3771         }
3772
3773         finish_spawn_return_on_user_stack(w, ff->parent, ff);
3774         // WARNING: the use of this lock macro is deceptive.
3775         // The worker may have changed here.
3776     } END_WITH_FRAME_LOCK(w, ff->parent);
3777     return w;
3778 }
3779
3780
3781
3782 /**
3783  * Execute fast "reductions" when ff stalls at a sync.
3784  *
3785  * @param   w  The currently executing worker.
3786  * @param   ff The full frame stalling at a sync.
3787  * @return  1 if we are finished with all reductions after calling this method.
3788  * @return  0 if we still need to execute the slow path reductions.
3789  */
3790 static inline
3791 int fast_path_reductions_for_sync(__cilkrts_worker *w,
3792                                   full_frame *ff) {
3793     // Return 0 if there is some reduction that needs to happen.
3794     return !(w->reducer_map  || ff->pending_exception);
3795 }
3796
/**
 * Executes slow reductions when ff stalls at a sync.
 * This method should execute only if
 *   fast_path_reductions_for_sync(w, ff) returned 0.
 *
 * The worker's reducer map is taken as the "middle" map.  Each iteration
 * locks ff, removes the current "left" map from its slot, and merges
 * pending exceptions into the left exception slot.  If either map is
 * missing, the surviving one is deposited in the left slot and the
 * method returns.  Otherwise the lock is dropped, the two maps are
 * merged with repeated_merge_reducer_maps(), and the loop repeats with
 * the merged map as the new middle map.
 *
 * After this method completes:
 *   1. ff's current reducer map has been deposited into
 *       right_reducer_map of ff's rightmost child, or
 *       ff->children_reducer_map if ff has no children.
 *   2. Similarly for ff's current exception.
 *   3. Nothing to calculate for stacks --- if we are stalling
 *      we will always free a stack.
 *
 * This method may repeatedly acquire/release the lock on ff; the lock
 * is not held when it returns.
 *
 * @param   w  The currently executing worker.
 * @param   ff The full frame stalling at a sync.
 * @return  The worker returning from this method (w is passed by
 *          address to repeated_merge_reducer_maps(), so it may differ
 *          from the worker passed in).
 */
static __cilkrts_worker*
slow_path_reductions_for_sync(__cilkrts_worker *w,
                              full_frame *ff)
{
    struct cilkred_map *left_map;
    struct cilkred_map *middle_map;

#if (REDPAR_DEBUG > 0)
    CILK_ASSERT(ff);
    CILK_ASSERT(w->head == w->tail);
#endif

    // Detach the worker's map; it is the map we need to deposit.
    middle_map = w->reducer_map;
    w->reducer_map = NULL;

    // Loop invariant: middle_map should be valid (the current map to reduce).
    //                 left_map is junk.
    //                 w->reducer_map == NULL.
    while (1) {
        BEGIN_WITH_FRAME_LOCK(w, ff) {
            splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff);

            // Grab the "left" map and store pointers to those locations.
            // The slot is cleared so the map is exclusively ours while
            // the lock is released for the merge below.
            left_map = *(left_ptrs.map_ptr);
            *(left_ptrs.map_ptr) = NULL;

            // Slide the maps in our struct left as far as possible.
            if (!left_map) {
                left_map = middle_map;
                middle_map = NULL;
            }

            // Fold ff's pending exception into the left exception slot.
            *(left_ptrs.exception_ptr) =
                __cilkrts_merge_pending_exceptions(w,
                                                   *left_ptrs.exception_ptr,
                                                   ff->pending_exception);
            ff->pending_exception = NULL;

            // If there is no middle map, then we are done.
            // Deposit left and return.
            if (!middle_map) {
                *(left_ptrs).map_ptr = left_map;
                #if (REDPAR_DEBUG > 0)
                CILK_ASSERT(NULL == w->reducer_map);
                #endif
                // Sanity check upon leaving the loop.
                verify_current_wkr(w);
                // Make sure to unlock before we return!
                // NOTE(review): returning here skips END_WITH_FRAME_LOCK,
                // which presumably performs the unlock on the normal
                // path, hence the explicit unlock.
                __cilkrts_frame_unlock(w, ff);
                return w;
            }
        } END_WITH_FRAME_LOCK(w, ff);

        // After we've released the lock, start counting time as
        // WORKING again.
        STOP_INTERVAL(w, INTERVAL_IN_RUNTIME);
        START_INTERVAL(w, INTERVAL_WORKING);

        // If we get here, we have a nontrivial reduction to execute.
        // The merged result becomes the middle map for the next
        // iteration; w is passed by address and may change.
        middle_map = repeated_merge_reducer_maps(&w,
                                                 left_map,
                                                 middle_map);
        verify_current_wkr(w);

        STOP_INTERVAL(w, INTERVAL_WORKING);
        START_INTERVAL(w, INTERVAL_IN_RUNTIME);

        // Save any exceptions generated because of the reduction
        // process.  These get merged the next time around the
        // loop.
        CILK_ASSERT(NULL == ff->pending_exception);
        ff->pending_exception = w->l->pending_exception;
        w->l->pending_exception = NULL;
    }

    // We should never break out of the loop above.
    CILK_ASSERT(0);
    return NULL;
}
3895
3896
3897 /**
3898  * Execute reductions when ff stalls at a sync.
3899  *
3900  * Execution starts on w, but may finish on a different worker.
3901  * This method may acquire/release the lock on ff.
3902  *
3903  * @param w          The currently executing worker.
3904  * @param ff         The full frame of the spawned function at the sync
3905  * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync
3906  * @return           The worker returning from this method.
3907  */
3908 static __cilkrts_worker*
3909 execute_reductions_for_sync(__cilkrts_worker *w,
3910                             full_frame *ff,
3911                             __cilkrts_stack_frame *sf_at_sync)
3912 {
3913     int finished_reductions;
3914     // Step B1 from reducer protocol above:
3915     // Restore runtime invariants.
3916     //
3917     // The following code for this step is almost equivalent to
3918     // the following sequence:
3919     //   1. disown(w, ff, sf_at_sync, "sync") (which itself
3920     //        calls make_unrunnable(w, ff, sf_at_sync))
3921     //   2. make_runnable(w, ff, sf_at_sync).
3922     //
3923     // The "disown" will mark the frame "sf_at_sync"
3924     // as stolen and suspended, and save its place on the stack,
3925     // so it can be resumed after the sync.
3926     //
3927     // The difference is, that we don't want the disown to
3928     // break the following connections yet, since we are
3929     // about to immediately make sf/ff runnable again anyway.
3930     //   sf_at_sync->worker == w
3931     //   w->l->frame_ff == ff.
3932     //
3933     // These connections are needed for parallel reductions, since
3934     // we will use sf / ff as the stack frame / full frame for
3935     // executing any potential reductions.
3936     //
3937     // TBD: Can we refactor the disown / make_unrunnable code
3938     // to avoid the code duplication here?
3939
3940     ff->call_stack = NULL;
3941
3942     // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and
3943     // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of
3944     // the stack so that a worker can resume the frame in the correct
3945     // place.
3946     //
3947     // But on this path, CILK_FRAME_STOLEN should already be set.
3948     // Also, we technically don't want to suspend the frame until
3949     // the reduction finishes.
3950     // We do, however, need to save the stack before
3951     // we start any reductions, since the reductions might push more
3952     // data onto the stack.
3953     CILK_ASSERT(sf_at_sync->flags | CILK_FRAME_STOLEN);
3954
3955     __cilkrts_put_stack(ff, sf_at_sync);
3956     __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1,
3957                                      "execute_reductions_for_sync");
3958     CILK_ASSERT(w->l->frame_ff == ff);
3959
3960     // Step B2: Execute reductions on user stack.
3961     // Check if we have any "real" reductions to do.
3962     finished_reductions = fast_path_reductions_for_sync(w, ff);
3963
3964     if (!finished_reductions) {
3965         // Still have some real reductions to execute.
3966         // Run them here.
3967
3968         // This method may acquire/release the lock on ff.
3969         w = slow_path_reductions_for_sync(w, ff);
3970
3971         // The previous call may return on a different worker.
3972         // than what we started on.
3973         verify_current_wkr(w);
3974     }
3975
3976 #if REDPAR_DEBUG >= 0
3977     CILK_ASSERT(w->l->frame_ff == ff);
3978     CILK_ASSERT(ff->call_stack == NULL);
3979 #endif
3980
3981     // Now we suspend the frame ff (since we've
3982     // finished the reductions).  Roughly, we've split apart the
3983     // "make_unrunnable" call here --- we've already saved the
3984     // stack info earlier before the reductions execute.
3985     // All that remains is to restore the call stack back into the
3986     // full frame, and mark the frame as suspended.
3987     ff->call_stack = sf_at_sync;
3988     sf_at_sync->flags |= CILK_FRAME_SUSPENDED;
3989
3990     // At a nontrivial sync, we should always free the current fiber,
3991     // because it can not be leftmost.
3992     w->l->fiber_to_free = ff->fiber_self;
3993     ff->fiber_self = NULL;
3994     return w;
3995 }
3996
3997
3998 /*
3999   Local Variables: **
4000   c-file-style:"bsd" **
4001   c-basic-offset:4 **
4002   indent-tabs-mode:nil **
4003   End: **
4004 */