4 *************************************************************************
7 * Copyright (C) 2010-2013, Intel Corporation
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
15 * * Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
21 * * Neither the name of Intel Corporation nor the names of its
22 * contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
31 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
32 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
33 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
34 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
36 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
39 **************************************************************************
43 // define _GNU_SOURCE before *any* #include.
44 // Even <stdint.h> will break later #includes if this macro is not
45 // already defined when it is #included.
52 #include "local_state.h"
53 #include "signal_node.h"
54 #include "full_frame.h"
56 #include "cilk_malloc.h"
57 #include "reducer_impl.h"
58 #include "metacall_impl.h"
61 // On x86 processors (but not MIC processors), the compiler generated code to
62 // save the FP state (rounding mode and the like) before calling setjmp. We
63 // will need to restore that state when we resume.
65 # if defined(__i386__) || defined(__x86_64)
66 # define RESTORE_X86_FP_STATE
67 # endif // defined(__i386__) || defined(__x86_64)
70 // contains notification macros for VTune.
71 #include "cilk-ittnotify.h"
76 // On Cygwin, string.h doesn't declare strcasecmp if __STRICT_ANSI__ is defined
77 # undef __STRICT_ANSI__
84 #if defined HAVE_ALLOCA_H
86 #elif defined __GNUC__
87 # define alloca __builtin_alloca
89 # define alloca __alloca
95 void *alloca (size_t);
99 //# include <scheduler.h> // Angle brackets include Apple's scheduler.h, not ours.
103 # include <sys/resource.h>
104 # include <sys/sysinfo.h>
108 # include <sys/resource.h>
109 // BSD does not define MAP_ANONYMOUS, but *does* define MAP_ANON. Aren't standards great!
110 # define MAP_ANONYMOUS MAP_ANON
114 # include <vxWorks.h>
115 # include <vxCpuLib.h>
/**
 * Unix-specific part of the global runtime state.
 *
 * The field order is kept as in the original definition; pthread_t_size is
 * read by an external tool (cilk_db), so the layout should not be changed
 * casually.
 */
struct global_sysdep_state
{
    pthread_t *threads;    ///< Array of pthreads for system workers
    size_t pthread_t_size; ///< for cilk_db
};
124 static void internal_enforce_global_visibility();
/*
 * __cilkrts_init_worker_sysdep
 *
 * System-dependent initialization for a single worker.  On Unix the only
 * work done here is the ITT_SYNC_CREATE notification, which tags the worker
 * with the name "Scheduler" (the macro comes from cilk-ittnotify.h, the
 * VTune notification header).
 */
void __cilkrts_init_worker_sysdep(struct __cilkrts_worker *w)
{
    ITT_SYNC_CREATE(w, "Scheduler");
}
/*
 * __cilkrts_destroy_worker_sysdep
 *
 * System-dependent teardown for a single worker.  Nothing needs to be
 * released on Unix, so this is intentionally empty.
 */
void __cilkrts_destroy_worker_sysdep(struct __cilkrts_worker *w)
{
    (void)w; /* unused */
}
139 void __cilkrts_init_global_sysdep(global_state_t
*g
)
141 internal_enforce_global_visibility();
143 __cilkrts_init_tls_variables();
145 CILK_ASSERT(g
->total_workers
>= g
->P
- 1);
146 g
->sysdep
= __cilkrts_malloc(sizeof (struct global_sysdep_state
));
147 CILK_ASSERT(g
->sysdep
);
148 g
->sysdep
->pthread_t_size
= sizeof (pthread_t
);
150 // TBD: Should this value be g->total_workers, or g->P?
151 // Need to check what we are using this field for.
152 g
->sysdep
->threads
= __cilkrts_malloc(sizeof(pthread_t
) * g
->total_workers
);
153 CILK_ASSERT(g
->sysdep
->threads
);
159 void __cilkrts_destroy_global_sysdep(global_state_t
*g
)
161 if (g
->sysdep
->threads
)
162 __cilkrts_free(g
->sysdep
->threads
);
163 __cilkrts_free(g
->sysdep
);
166 /*************************************************************
167 Creation of worker threads:
168 *************************************************************/
170 static void internal_run_scheduler_with_exceptions(__cilkrts_worker
*w
)
172 /* We assume the stack grows down. */
174 __cilkrts_cilkscreen_establish_c_stack(&var
- 1000000, &var
);
176 __cilkrts_run_scheduler_with_exceptions(w
);
182 * scheduler_thread_proc_for_system_worker
184 * Thread start function called when we start a new worker.
187 NON_COMMON
void* scheduler_thread_proc_for_system_worker(void *arg
)
190 __cilkrts_worker
*w
= (__cilkrts_worker
*)arg
;
192 #ifdef __INTEL_COMPILER
194 // Name the threads for Advisor. They don't want a worker number.
195 __itt_thread_set_name("Cilk Worker");
196 #endif // defined USE_ITTNOTIFY
197 #endif // defined __INTEL_COMPILER
199 /* Worker startup is serialized
200 status = pthread_mutex_lock(&__cilkrts_global_mutex);
201 CILK_ASSERT(status == 0);*/
202 CILK_ASSERT(w
->l
->type
== WORKER_SYSTEM
);
203 /*status = pthread_mutex_unlock(&__cilkrts_global_mutex);
204 CILK_ASSERT(status == 0);*/
206 __cilkrts_set_tls_worker(w
);
208 // Create a cilk fiber for this worker on this thread.
209 START_INTERVAL(w
, INTERVAL_FIBER_ALLOCATE_FROM_THREAD
) {
210 w
->l
->scheduling_fiber
= cilk_fiber_allocate_from_thread();
211 cilk_fiber_set_owner(w
->l
->scheduling_fiber
, w
);
212 } STOP_INTERVAL(w
, INTERVAL_FIBER_ALLOCATE_FROM_THREAD
);
214 internal_run_scheduler_with_exceptions(w
);
216 START_INTERVAL(w
, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD
) {
217 // Deallocate the scheduling fiber. This operation reverses the
218 // effect of cilk_fiber_allocate_from_thread() and must be done in this
219 // thread before it exits.
220 int ref_count
= cilk_fiber_deallocate_from_thread(w
->l
->scheduling_fiber
);
221 // Scheduling fibers should never have extra references to them.
222 // We only get extra references into fibers because of Windows
224 CILK_ASSERT(0 == ref_count
);
225 w
->l
->scheduling_fiber
= NULL
;
226 } STOP_INTERVAL(w
, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD
);
233 * __cilkrts_user_worker_scheduling_stub
235 * Routine for the scheduling fiber created for an imported user
236 * worker thread. This method is analogous to
237 * scheduler_thread_proc_for_system_worker.
240 void __cilkrts_user_worker_scheduling_stub(cilk_fiber
* fiber
, void* null_arg
)
242 __cilkrts_worker
*w
= __cilkrts_get_tls_worker();
245 CILK_ASSERT(WORKER_USER
== w
->l
->type
);
247 // Enter the scheduling loop on the user worker.
248 // This function will never return.
249 __cilkrts_run_scheduler_with_exceptions(w
);
251 // A WORKER_USER, at some point, will resume on the original stack and leave
252 // Cilk. Under no circumstances do we ever exit off of the bottom of this
/*
 * __cilkrts_worker_stub
 *
 * We are exporting a function with this name to Inspector?
 * What a confusing name...
 *
 * This function is exported so Piersol's stack trace displays
 * reasonable information.  It simply forwards to
 * scheduler_thread_proc_for_system_worker and returns its result.
 */
void* __cilkrts_worker_stub(void* arg)
{
    return scheduler_thread_proc_for_system_worker(arg);
}
271 // /* Return the lesser of the argument and the operating system
272 // limit on the number of workers (threads) that may or ought
274 // int sysdep_thread_limit(int n, int physical_cpus)
276 // /* On Linux thread creation fails somewhere short of the
277 // number of available processes. */
278 // struct rlimit lim;
280 // if (n > 256 + 2 * physical_cpus)
281 // n = 256 + 2 * physical_cpus;
283 // if (getrlimit(RLIMIT_NPROC, &lim) == 0 && lim.rlim_cur != RLIM_INFINITY)
285 // /* If the limit reads 0 or absurdly small, ignore it. */
286 // unsigned int maxproc = (lim.rlim_cur * 3 + 3) / 4;
287 // if (maxproc > 8 + 2 * physical_cpus && maxproc < n)
295 static void write_version_file (global_state_t
*, int);
297 /* Create n worker threads from base..top-1
299 static void create_threads(global_state_t
*g
, int base
, int top
)
301 // TBD(11/30/12): We want to insert code providing the option of
302 // pinning system workers to cores.
303 for (int i
= base
; i
< top
; i
++) {
304 int status
= pthread_create(&g
->sysdep
->threads
[i
],
306 scheduler_thread_proc_for_system_worker
,
309 __cilkrts_bug("Cilk runtime error: thread creation (%d) failed: %d\n", i
, status
);
313 #if PARALLEL_THREAD_CREATE
314 static int volatile threads_created
= 0;
316 // Create approximately half of the worker threads, and then become a worker
318 static void * create_threads_and_work (void * arg
)
320 global_state_t
*g
= ((__cilkrts_worker
*)arg
)->g
;
322 create_threads(g
, g
->P
/2, g
->P
-1);
323 // Let the initial thread know that we're done.
326 // Ideally this turns into a tail call that wipes out this stack frame.
327 return scheduler_thread_proc_for_system_worker(arg
);
330 void __cilkrts_start_workers(global_state_t
*g
, int n
)
332 g
->workers_running
= 1;
335 if (!g
->sysdep
->threads
)
338 // Do we actually have any threads to create?
341 #if PARALLEL_THREAD_CREATE
343 // We create (a rounded up) half of the threads, thread one creates the rest
344 int half_threads
= (n
+1)/2;
346 // Create the first thread passing a different thread function, so that it creates threads itself
347 status
= pthread_create(&g
->sysdep
->threads
[0], NULL
, create_threads_and_work
, g
->workers
[0]);
350 __cilkrts_bug("Cilk runtime error: thread creation (0) failed: %d\n", status
);
352 // Then the rest of the ones we have to create
353 create_threads(g
, 1, half_threads
);
355 // Now wait for the first created thread to tell us it's created all of its threads.
356 // We could maybe drop this a bit lower and overlap with write_version_file.
357 while (!threads_created
)
360 // Simply create all the threads linearly here.
361 create_threads(g
, 0, n
);
364 // write the version information to a file if the environment is configured
365 // for it (the function makes the check).
366 write_version_file(g
, n
);
372 void __cilkrts_stop_workers(global_state_t
*g
)
376 // Tell the workers to give up
380 if (g
->workers_running
== 0)
383 if (!g
->sysdep
->threads
)
386 /* Make them all runnable. */
388 CILK_ASSERT(g
->workers
[0]->l
->signal_node
);
389 signal_node_msg(g
->workers
[0]->l
->signal_node
, 1);
392 for (i
= 0; i
< g
->P
- 1; ++i
) {
396 sc_status
= pthread_join(g
->sysdep
->threads
[i
], &th_status
);
398 __cilkrts_bug("Cilk runtime error: thread join (%d) failed: %d\n", i
, sc_status
);
401 g
->workers_running
= 0;
409 * @brief Returns the stack address for resuming execution of sf.
411 * This method takes in the top of the stack to use, and then returns
412 * a properly aligned address for resuming execution of sf.
414 * @param sf - The stack frame we want to resume executing.
415 * @param stack_base - The top of the stack we want to execute sf on.
418 static char* get_sp_for_executing_sf(char* stack_base
,
420 __cilkrts_stack_frame
*sf
)
422 // The original calculation that had been done to correct the stack
423 // pointer when resuming execution.
425 // But this code was never getting called in the eng branch anyway...
427 // TBD(11/30/12): This logic needs to be revisited to make sure that
428 // we are doing the proper calculation in reserving space for outgoing
429 // arguments on all platforms and architectures.
431 /* Preserve outgoing argument space and stack alignment on steal.
432 Outgoing argument space is bounded by the difference between
433 stack and frame pointers. Some user code is known to rely on
434 16 byte alignment. Maintain 32 byte alignment for future
436 #define SMASK 31 /* 32 byte alignment */
438 char *fp
= FP(sf
), *sp
= SP(sf
);
439 int fp_align
= (int)(size_t)fp
& SMASK
;
440 ptrdiff_t space
= fp
- sp
;
442 fprintf(stderr
, "Here: fp = %p, sp = %p\n", fp
, sp
);
443 char *top_aligned
= (char *)((((size_t)stack_base
- SMASK
) & ~(size_t)SMASK
) | fp_align
);
444 /* Don't allocate an unreasonable amount of stack space. */
446 fprintf(stderr
, "Here: stack_base = %p, top_aligned=%p, space=%ld\n",
447 stack_base
, top_aligned
, space
);
449 space
= 32 + (space
& SMASK
);
450 else if (space
> 40 * 1024)
451 space
= 40 * 1024 + (space
& SMASK
);
453 return top_aligned
- space
;
457 #define PERFORM_FRAME_SIZE_CALCULATION 0
459 char* new_stack_base
= stack_base
- 256;
461 #if PERFORM_FRAME_SIZE_CALCULATION
462 // If there is a frame size saved, then use that as the
463 // correction instead of 256.
464 if (ff
->frame_size
> 0) {
465 if (ff
->frame_size
< 40*1024) {
466 new_stack_base
= stack_base
- ff
->frame_size
;
469 // If for some reason, our frame size calculation is giving us
470 // a number which is bigger than about 10 pages, then
471 // there is likely something wrong here? Don't allocate
472 // an unreasonable amount of space.
473 new_stack_base
= stack_base
- 40*1024;
478 // Whatever correction we choose, align the final stack top.
479 // This alignment seems to be necessary in particular on 32-bit
480 // Linux, and possibly Mac. (Is 32-byte alignment sufficient?)
481 /* 256-byte alignment. Why not? */
482 const uintptr_t align_mask
= ~(256 -1);
483 new_stack_base
= (char*)((size_t)new_stack_base
& align_mask
);
484 return new_stack_base
;
487 char* sysdep_reset_jump_buffers_for_resume(cilk_fiber
* fiber
,
489 __cilkrts_stack_frame
*sf
)
492 fprintf(stderr
, "ThreadId=%p (fiber_proc_to_resume), Fiber %p. sf = %p. ff=%p, ff->sync_sp=%p\n",
493 cilkos_get_current_thread_id(),
500 void* sp
= (void*)get_sp_for_executing_sf(cilk_fiber_get_stack_base(fiber
), ff
, sf
);
503 /* Debugging: make sure stack is accessible. */
504 ((volatile char *)sp
)[-1];
506 // Adjust the saved_sp to account for the SP we're about to run. This will
507 // allow us to track fluctuations in the stack
509 fprintf(stderr
, "ThreadId=%p, about to take stack ff=%p, sp=%p, sync_sp=%p\n",
510 cilkos_get_current_thread_id(),
515 __cilkrts_take_stack(ff
, sp
);
520 NORETURN
sysdep_longjmp_to_sf(char* new_sp
,
521 __cilkrts_stack_frame
*sf
,
522 full_frame
*ff_for_exceptions
/* UNUSED on Unix */)
526 "ThreadId=%p. resume user code, sf=%p, new_sp = %p, original SP(sf) = %p, FP(sf) = %p\n",
527 cilkos_get_current_thread_id(), sf
, new_sp
, SP(sf
), FP(sf
));
530 // Set the stack pointer.
533 #ifdef RESTORE_X86_FP_STATE
534 if (CILK_FRAME_VERSION_VALUE(sf
->flags
) >= 1) {
535 // Restore the floating point state that was set in this frame at the
538 // This feature is only available in ABI 1 or later frames, and only
539 // needed on IA-32 or Intel64 (x86) processors.
540 restore_x86_fp_state(sf
);
544 CILK_LONGJMP(sf
->ctx
);
551 #include <sys/mman.h>
555 void __cilkrts_make_unrunnable_sysdep(__cilkrts_worker
*w
,
557 __cilkrts_stack_frame
*sf
,
561 (void)w
; /* unused */
566 if (ff
->frame_size
== 0)
567 ff
->frame_size
= __cilkrts_get_frame_size(sf
);
569 // Null loot's sp for debugging purposes (so we'll know it's not valid)
575 * __cilkrts_sysdep_is_worker_thread_id
577 * Returns true if the thread ID specified matches the thread ID we saved
581 int __cilkrts_sysdep_is_worker_thread_id(global_state_t
*g
,
585 #if defined( __linux__) || defined(__VXWORKS__)
586 pthread_t tid
= *(pthread_t
*)thread_id
;
587 if (i
< 0 || i
> g
->total_workers
)
589 return g
->sysdep
->threads
[i
] == tid
;
591 // Needs to be implemented
599 /*************************************************************
601 *************************************************************/
604 #include "internal/cilk_version.h"
606 #include <sys/utsname.h>
/* (Non-static) dummy function is used by get_runtime_path() to find the path
 * to the .so containing the Cilk runtime.  It must keep external linkage:
 * its address is what gets handed to dladdr().
 */
void dummy_function(void) { }
617 /* return a string with the path to the Cilk runtime, or "unknown" if the path
618 * cannot be determined.
620 static const char *get_runtime_path ()
623 // Cygwin doesn't support dladdr, which sucks
627 if (0 == dladdr(dummy_function
, &info
)) return "unknown";
628 return info
.dli_fname
;
632 /* if the environment variable, CILK_VERSION, is defined, writes the version
633 * information to the specified file.
634 * g is the global state that was just created, and n is the number of workers
635 * that were made (or requested from RML) for it.
637 static void write_version_file (global_state_t
*g
, int n
)
639 const char *env
; // environment variable.
640 char buf
[256]; // print buffer.
643 struct utsname sys_info
;
644 int err
; // error code from system calls.
646 // if CILK_VERSION is not set, or if the file cannot be opened, fail
647 // silently. Otherwise open the file for writing (or use stderr or stdout
648 // if the user specifies).
649 if (NULL
== (env
= getenv("CILK_VERSION"))) return;
650 if (0 == strcasecmp(env
, "stderr")) fp
= stderr
;
651 else if (0 == strcasecmp(env
, "stdout")) fp
= stdout
;
652 else if (NULL
== (fp
= fopen(env
, "w"))) return;
654 // get a string for the current time. E.g.,
655 // Cilk runtime initialized: Thu Jun 10 13:28:00 2010
657 strftime(buf
, 256, "%a %b %d %H:%M:%S %Y", localtime(&t
));
658 fprintf(fp
, "Cilk runtime initialized: %s\n", buf
);
660 // Print runtime info. E.g.,
661 // Cilk runtime information
662 // ========================
663 // Cilk version: 2.0.0 Build 9184
664 // Built by willtor on host willtor-desktop
665 // Compilation date: Thu Jun 10 13:27:42 2010
666 // Compiled with ICC V99.9.9, ICC build date: 20100610
668 fprintf(fp
, "\nCilk runtime information\n");
669 fprintf(fp
, "========================\n");
670 fprintf(fp
, "Cilk version: %d.%d.%d Build %d\n",
676 char * vxWorksVer
= VXWORKS_VERSION
;
677 fprintf(fp
, "Cross compiled for %s\n",vxWorksVer
);
678 // User and host are not available if VxWorks is cross-compiled on a Windows build host
681 // User and host are not available for GCC builds
683 fprintf(fp
, "Built by "BUILD_USER
" on host "BUILD_HOST
"\n");
685 #endif // __VXWORKS__
687 // GCC has requested that this be removed for GCC builds
689 fprintf(fp
, "Compilation date: "__DATE__
" "__TIME__
"\n");
692 #ifdef __INTEL_COMPILER
693 // Compiled by the Intel C/C++ compiler.
694 fprintf(fp
, "Compiled with ICC V%d.%d.%d, ICC build date: %d\n",
695 __INTEL_COMPILER
/ 100,
696 (__INTEL_COMPILER
/ 10) % 10,
697 __INTEL_COMPILER
% 10,
698 __INTEL_COMPILER_BUILD_DATE
);
701 fprintf(fp
, "Compiled with GCC V%d.%d.%d\n",
704 __GNUC_PATCHLEVEL__
);
705 #endif // defined __INTEL_COMPILER
707 // Print system info. E.g.,
708 // System information
709 // ==================
710 // Cilk runtime path: /opt/icc/64/lib/libcilkrts.so.5
711 // System OS: Linux, release 2.6.28-19-generic
712 // System architecture: x86_64
714 err
= uname(&sys_info
);
715 fprintf(fp
, "\nSystem information\n");
716 fprintf(fp
, "==================\n");
717 fprintf(fp
, "Cilk runtime path: %s\n", get_runtime_path());
718 fprintf(fp
, "System OS: %s, release %s\n",
719 err
< 0 ? "unknown" : sys_info
.sysname
,
720 err
< 0 ? "?" : sys_info
.release
);
721 fprintf(fp
, "System architecture: %s\n",
722 err
< 0 ? "unknown" : sys_info
.machine
);
724 // Print thread info. E.g.,
725 // Thread information
726 // ==================
728 // Cilk workers requested: 8
729 // Thread creator: Private
731 fprintf(fp
, "\nThread information\n");
732 fprintf(fp
, "==================\n");
734 fprintf(fp
, "System cores: %d\n", (int)__builtin_popcount(vxCpuEnabledGet()));
736 fprintf(fp
, "System cores: %d\n", (int)sysconf(_SC_NPROCESSORS_ONLN
));
738 fprintf(fp
, "Cilk workers requested: %d\n", n
);
739 #if (PARALLEL_THREAD_CREATE)
740 fprintf(fp
, "Thread creator: Private (parallel)\n");
742 fprintf(fp
, "Thread creator: Private\n");
745 if (fp
!= stderr
&& fp
!= stdout
) fclose(fp
);
746 else fflush(fp
); // flush the handle buffer if it is stdout or stderr.
751 * __cilkrts_establish_c_stack
753 * Tell Cilkscreen about the user stack bounds.
755 * Note that the Cilk V1 runtime only included the portion of the stack from
756 * the entry into Cilk, down. We don't appear to be able to find that, but
757 * I think this will be sufficient.
760 void __cilkrts_establish_c_stack(void)
762 /* FIXME: Not implemented. */
764 /* TBD: Do we need this */
766 void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end);
769 MEMORY_BASIC_INFORMATION mbi;
771 r = VirtualQuery (&mbi,
775 __cilkrts_cilkscreen_establish_c_stack((char *)mbi.BaseAddress,
776 (char *)mbi.BaseAddress + mbi.RegionSize);
/*
 * internal_enforce_global_visibility
 *
 * Ensure global visibility of public symbols, for proper Cilk-TBB interop.
 *
 * If Cilk runtime is loaded dynamically, its symbols might remain unavailable
 * for global search with dladdr; that might prevent TBB from finding Cilk
 * in the process address space and initiating the interop protocol.
 * The workaround is for the library to open itself with RTLD_GLOBAL flag.
 */
static __attribute__((noinline))
void internal_enforce_global_visibility(void)
{
    void* handle = dlopen( get_runtime_path(), RTLD_GLOBAL|RTLD_LAZY );

    /* For proper reference counting, close the handle immediately. */
    if( handle ) {
        dlclose(handle);
    }
}
803 c-file-style:"bsd" **
805 indent-tabs-mode:nil **