/*************************************************************************
 *
 * Copyright (C) 2009-2013, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/
// Define _GNU_SOURCE before *any* #include.
// Even <stdint.h> will break later #includes if this macro is not
// already defined when it is #included.
#define _GNU_SOURCE
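
// Illustrative sketch (not part of the runtime) of why the ordering matters
// on glibc: feature-test macros are sampled the first time a libc header is
// processed, so defining _GNU_SOURCE after any #include has no effect.
#if 0
#include <stdint.h>   // glibc's features.h is processed here, without _GNU_SOURCE
#define _GNU_SOURCE   // too late: later headers keep the first configuration
#include <sched.h>    // GNU extensions such as CPU_SETSIZE may be missing
#endif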
#include "cilk_malloc.h"
#include <internal/abi.h>

#if defined __linux__
#   include <sys/sysinfo.h>
#   include <sys/syscall.h>
#elif defined __APPLE__
#   include <sys/sysctl.h>
    // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
#elif defined __DragonFly__
// No additional include files
#elif defined __FreeBSD__
// No additional include files
#elif defined __CYGWIN__
// Cygwin on Windows - no additional include files
#elif defined __VXWORKS__
#   include <vxWorks.h>
#   include <vxCpuLib.h>
#   include <taskLib.h>
#elif defined __sun__ && defined __svr4__
#   include <sched.h>
#else
#   error "Unsupported OS"
#endif
#include <sys/types.h>
// /* Thread-local storage */
// #ifdef _WIN32
// typedef unsigned cilkos_tls_key_t;
// #else
// typedef pthread_key_t cilkos_tls_key_t;
// #endif
// cilkos_tls_key_t cilkos_allocate_tls_key();
// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
// void* cilkos_get_tls_pointer(cilkos_tls_key_t key);
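
// A minimal sketch, assuming the pthreads case, of how the API outlined
// above could be implemented; the function and type names come from the
// commented-out sketch and are not part of any shipped header.
#if 0
typedef pthread_key_t cilkos_tls_key_t;

cilkos_tls_key_t cilkos_allocate_tls_key()
{
    pthread_key_t key;
    int status = pthread_key_create(&key, NULL);  // no destructor
    CILK_ASSERT(status == 0);
    return key;
}

void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr)
{
    int status = pthread_setspecific(key, ptr);
    CILK_ASSERT(status == 0);
}

void* cilkos_get_tls_pointer(cilkos_tls_key_t key)
{
    return pthread_getspecific(key);
}
#endif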
#if !defined CILK_WORKER_TLS
static int cilk_keys_defined;
static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
static pthread_key_t fiber_key;
#endif

static void *serial_worker;
// This destructor is called when a pthread dies to deallocate the
// pedigree node.
static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
{
    __cilkrts_pedigree* pedigree_tls
        = (__cilkrts_pedigree*)pedigree_tls_ptr;
    if (pedigree_tls) {
        // Assert that we have either one or two nodes
        // left in the pedigree chain.
        // If we have more, then something is going wrong...
        CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
        __cilkrts_free(pedigree_tls);
    }
}
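
// For reference, the pthreads contract this destructor relies on: a
// destructor registered with pthread_key_create() runs at thread exit for
// each key whose value is non-NULL in the exiting thread. A hypothetical
// standalone example:
#if 0
static pthread_key_t example_key;   // created once via pthread_key_create(
                                    // &example_key, example_destructor)

static void example_destructor(void* p)
{
    free(p);   // reclaim this thread's allocation
}

static void* example_thread_main(void* arg)
{
    pthread_setspecific(example_key, malloc(64));
    return NULL;   // example_destructor runs on the value as the thread exits
}
#endif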
void __cilkrts_init_tls_variables(void)
{
    int status;
    /* This will be called once in serial execution before any
       Cilk parallelism so we do not need to worry about races
       on cilk_keys_defined. */
    if (cilk_keys_defined)
        return;
    status = pthread_key_create(&worker_key, NULL);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&pedigree_leaf_key,
                                __cilkrts_pedigree_leaf_destructor);
    CILK_ASSERT (status == 0);
    status = pthread_key_create(&tbb_interop_key, NULL);
    CILK_ASSERT (status == 0);

#if SUPPORT_GET_CURRENT_FIBER > 0
    status = pthread_key_create(&fiber_key, NULL);
    CILK_ASSERT (status == 0);
#endif

    cilk_keys_defined = 1;
}
void* cilkos_get_current_thread_id(void)
{
    return (void*)pthread_self();
}
CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilkrts_worker *)pthread_getspecific(worker_key);
    else
        return serial_worker;
}
CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
{
    return (__cilkrts_worker *)pthread_getspecific(worker_key);
}
__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (__cilk_tbb_stack_op_thunk *)
            pthread_getspecific(tbb_interop_key);
    else
        return 0;
}
// This counter should be updated atomically.
static int __cilkrts_global_pedigree_tls_counter = -1;
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;
    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return NULL;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread.
        pedigree_tls = (__cilkrts_pedigree *)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        // Create X, whose parent is Y.
        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
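
// Sketch of the resulting chain, plus a hypothetical caller; X is the
// mutable leaf and Y the constant per-thread root:
//
//   pedigree_tls[0] (X, rank 0) --parent--> pedigree_tls[1] (Y) --parent--> NULL
#if 0
static void example_walk_pedigree(void)
{
    __cilkrts_pedigree* leaf = __cilkrts_get_tls_pedigree_leaf(1 /* create_new */);
    __cilkrts_pedigree* root = leaf->parent;
    CILK_ASSERT(root && !root->parent);   // exactly the two-node chain above
}
#endif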
#if SUPPORT_GET_CURRENT_FIBER > 0
cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
{
    if (__builtin_expect(cilk_keys_defined, 1))
        return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
    else
        return NULL;
}
#endif
void __cilkrts_set_tls_worker(__cilkrts_worker *w)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(worker_key, w);
        CILK_ASSERT (status == 0);
    }
    else {
        serial_worker = w;
    }
}
void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(tbb_interop_key, t);
        CILK_ASSERT (status == 0);
    }
}
void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
        CILK_ASSERT (status == 0);
    }
}
#if SUPPORT_GET_CURRENT_FIBER > 0
void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
{
    if (__builtin_expect(cilk_keys_defined, 1)) {
        int status;
        status = pthread_setspecific(fiber_key, fiber);
        CILK_ASSERT (status == 0);
    }
}
#endif
#else  // CILK_WORKER_TLS
void __cilkrts_init_tls_variables(void)
{
}
#endif  // CILK_WORKER_TLS
#if defined (__linux__) && ! defined(__ANDROID__)
/*
 * Get the thread id, rather than the pid. In the case of MIC offload, it's
 * possible that we have multiple threads entering Cilk, and each has a
 * different affinity.
 */
static pid_t linux_gettid(void)
{
    return syscall(SYS_gettid);
}
/*
 * On Linux we look at the thread affinity mask and restrict ourselves to one
 * thread for each of the hardware contexts to which we are bound.
 * Therefore if the user does
 *   % taskset -c 0-1 cilkProgram
 *   # restrict execution to hardware contexts zero and one
 * the Cilk program will only use two threads even if it is running on a
 * machine that has 32 hardware contexts.
 * This is the right thing to do, because the threads are restricted to two
 * hardware contexts by the affinity mask set by taskset, and if we were to
 * create extra threads they would simply oversubscribe the hardware resources
 * we can use.
 * This is particularly important on MIC in offload mode, where the affinity
 * mask is set by the offload library to force the offload code away from
 * cores that have offload support threads running on them.
 */
static int linux_get_affinity_count (int tid)
{
#if !defined HAVE_PTHREAD_AFFINITY_NP
    return 0;
#else
    cpu_set_t process_mask;

    // Extract the thread affinity mask
    int err = sched_getaffinity(tid, sizeof(process_mask), &process_mask);
    if (0 != err)
        return 0;

    // We have extracted the mask OK, so now we can count the number of threads
    // in it. This is linear in the maximum number of CPUs available. We
    // could do a logarithmic version, if we assume the format of the mask,
    // but it's not really worth it. We only call this at thread startup
    // anyway.
    int available_procs = 0;
    int i;
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        if (CPU_ISSET(i, &process_mask))
            available_procs++;
    }

    return available_procs;
#endif
}
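
// Note: with _GNU_SOURCE, glibc also offers a CPU_COUNT() macro that performs
// this population count directly; a sketch of the equivalent:
#if 0
static int linux_get_affinity_count_alt(int tid)
{
    cpu_set_t process_mask;
    if (sched_getaffinity(tid, sizeof(process_mask), &process_mask) != 0)
        return 0;
    return CPU_COUNT(&process_mask);
}
#endif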
#endif // defined (__linux__) && ! defined(__ANDROID__)
/*
 * __cilkrts_hardware_cpu_count
 *
 * Returns the number of available CPUs on this hardware. This is
 * architecture-dependent.
 */
COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
{
#if defined __ANDROID__ || (defined(__sun__) && defined(__svr4__))
    return sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __MIC__
    /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
    /// on KNC. Also, ignore the last core.
    int P = sysconf (_SC_NPROCESSORS_ONLN);
    return P/2 - 2;
#elif defined __linux__
    int affinity_count = linux_get_affinity_count(linux_gettid());

    return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN);
#elif defined __APPLE__
    int count = 0;
    int cmd[2] = { CTL_HW, HW_NCPU };
    size_t len = sizeof count;
    int status = sysctl(cmd, 2, &count, &len, 0, 0);
    assert(0 == status);
    assert((unsigned)count == count);

    return count;
#elif defined __FreeBSD__ || defined __CYGWIN__ || defined __DragonFly__
    int ncores = sysconf(_SC_NPROCESSORS_ONLN);

    return ncores;
    // Just get the number of processors
    // return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined __VXWORKS__
    return __builtin_popcount( vxCpuEnabledGet() );
#else
#error "Unknown architecture"
#endif
}
COMMON_SYSDEP void __cilkrts_sleep(void)
{
#if defined __VXWORKS__
    taskDelay(1);
#else
    usleep(1);
#endif
}
COMMON_SYSDEP void __cilkrts_yield(void)
{
#if __APPLE__ || __FreeBSD__ || __VXWORKS__
    // On MacOS, call sched_yield to yield quantum. I'm not sure why we
    // don't do this on Linux also.
    sched_yield();
#elif defined(__DragonFly__)
    // On DragonFly BSD, call sched_yield to yield quantum.
    sched_yield();
#elif defined(__MIC__)
    // On MIC, pthread_yield() really trashes things. Arch's measurements
    // showed that calling _mm_delay_32() (or doing nothing) was a better
    // option. Delaying 1024 clock cycles is a reasonable compromise between
    // giving up the processor and latency starting up when work becomes
    // available.
    _mm_delay_32(1024);
#elif defined(__ANDROID__) || (defined(__sun__) && defined(__svr4__))
    // On Android and Solaris, call sched_yield to yield quantum. I'm not
    // sure why we don't do this on Linux also.
    sched_yield();
#else
    // On Linux, call pthread_yield (which in turn will call sched_yield)
    // to yield quantum.
    pthread_yield();
#endif
}
COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
                                           const char* varname)
{
    CILK_ASSERT(value);
    CILK_ASSERT(varname);

    const char* envstr = getenv(varname);
    if (envstr)
    {
        size_t len = strlen(envstr);
        if (len > vallen - 1)
            return len + 1;

        strcpy(value, envstr);
        return len;
    }
    else
    {
        value[0] = '\0';
        return 0;
    }
}
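
// Hypothetical caller, illustrating the size contract: the return value is
// the string length on success, or the required buffer size (length + 1)
// when `value` is too small to hold it.
#if 0
static void example_read_env(void)
{
    char buf[16];
    size_t n = cilkos_getenv(buf, sizeof(buf), "CILK_NWORKERS");
    if (n > sizeof(buf) - 1) {
        // buf was too small; n is the buffer size needed for the value.
    } else if (n > 0) {
        // buf now holds the NUL-terminated value.
    }
}
#endif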
/*
 * Unrecoverable error: Print an error message and abort execution.
 */
COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk error: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
    fprintf(stderr, "Exiting.\n");
    fflush(stderr);

    abort();
}
/*
 * Print a warning message and return.
 */
COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
{
    va_list l;
    fflush(NULL);
    fprintf(stderr, "Cilk warning: ");
    va_start(l, fmt);
    vfprintf(stderr, fmt, l);
    va_end(l);
}
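
// Example call sites (hypothetical), showing the printf-style interface:
#if 0
static void example_report(size_t nbytes)
{
    cilkos_warning("env var %s malformed; using %d workers\n",
                   "CILK_NWORKERS", 4);                         // returns
    cilkos_error("cannot allocate %zu bytes for worker state\n",
                 nbytes);                                       // aborts
}
#endif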
static void __attribute__((constructor)) init_once()
{
    /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
    __cilkrts_init_tls_variables();
}
#define PAGE 4096
#define CILK_MIN_STACK_SIZE (4*PAGE)
// Default size for the stacks that we create in Cilk for Unix.
#define CILK_DEFAULT_STACK_SIZE 0x100000
/*
 * Convert the user's specified stack size into a "reasonable" value
 * for this OS.
 */
size_t cilkos_validate_stack_size(size_t specified_stack_size) {
    // Convert an unspecified (zero) value to the default.
    if (specified_stack_size == 0) {
        CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
        return CILK_DEFAULT_STACK_SIZE;
    }
    // Round values in between 0 and CILK_MIN_STACK_SIZE up to
    // CILK_MIN_STACK_SIZE.
    if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
        return CILK_MIN_STACK_SIZE;
    }
    if ((specified_stack_size % PAGE) > 0) {
        // Round the user's stack size value up to the nearest page boundary.
        return (PAGE * (1 + specified_stack_size / PAGE));
    }
    return specified_stack_size;
}
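
// Worked examples of the rules above, assuming PAGE == 4096 (so
// CILK_MIN_STACK_SIZE == 16384):
#if 0
CILK_ASSERT(cilkos_validate_stack_size(0)       == CILK_DEFAULT_STACK_SIZE);
CILK_ASSERT(cilkos_validate_stack_size(5000)    == CILK_MIN_STACK_SIZE);  // raised to minimum
CILK_ASSERT(cilkos_validate_stack_size(20000)   == 20480);    // next page boundary
CILK_ASSERT(cilkos_validate_stack_size(0x20000) == 0x20000);  // already page-aligned
#endif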
long cilkos_atomic_add(volatile long* p, long x)
{
    return __sync_add_and_fetch(p, x);
}
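
// For comparison, a sketch of the same operation with C11 atomics; the
// runtime itself keeps the GCC __sync builtin, which returns the new value:
#if 0
#include <stdatomic.h>

long cilkos_atomic_add_c11(volatile _Atomic long* p, long x)
{
    return atomic_fetch_add(p, x) + x;   // fetch_add returns the *old* value
}
#endif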