3 *************************************************************************
5 * Copyright (C) 2009-2016, Intel Corporation
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
32 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
35 * *********************************************************************
37 * PLEASE NOTE: This file is a downstream copy of a file maintained in
38 * a repository at cilkplus.org. Changes made to this file that are not
39 * submitted through the contribution process detailed at
40 * http://www.cilkplus.org/submit-cilk-contribution will be lost the next
41 * time that a new version is released. Changes only submitted to the
42 * GNU compiler collection or posted to the git repository at
43 * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
46 * We welcome your contributions to this open source project. Thank you
47 * for your assistance in helping us improve Cilk Plus.
48 **************************************************************************/
52 #include "cilk_malloc.h"
53 #include <internal/abi.h>
56 # include <sys/sysinfo.h>
57 # include <sys/syscall.h>
59 #elif defined __APPLE__
60 # include <sys/sysctl.h>
61 // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
63 #elif defined __VXWORKS__
65 # include <vxCpuLib.h>
69 #elif defined __sun__ && defined __svr4__
72 // OSes we know about which don't require any additional files
73 #elif defined __CYGWIN__ || \
74 defined __DragonFly__ || \
75 defined __FreeBSD__ || \
77 // No additional include files
80 # error "Unsupported OS"
90 #include <sys/types.h>
94 // /* Thread-local storage */
96 // typedef unsigned cilkos_tls_key_t;
98 // typedef pthread_key_t cilkos_tls_key_t;
100 // cilkos_tls_key_t cilkos_allocate_tls_key();
101 // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
102 // void* cilkos_get_tls_pointer(cilkos_tls_key_t key);
104 #if !defined CILK_WORKER_TLS
// Nonzero once __cilkrts_init_tls_variables() has created the pthread
// keys declared below.
static int cilk_keys_defined;

// pthread keys for the per-thread runtime state.  All of them are created
// by __cilkrts_init_tls_variables().
static pthread_key_t worker_key;
static pthread_key_t pedigree_leaf_key;
static pthread_key_t tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
static pthread_key_t fiber_key;
#endif

// Value returned by __cilkrts_get_tls_worker() while cilk_keys_defined is
// still zero.
static void *serial_worker;
115 // This destructor is called when a pthread dies to deallocate the
117 static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr
)
119 __cilkrts_pedigree
* pedigree_tls
120 = (__cilkrts_pedigree
*)pedigree_tls_ptr
;
122 // Assert that we have either one or two nodes
123 // left in the pedigree chain.
124 // If we have more, then something is going wrong...
125 CILK_ASSERT(!pedigree_tls
->parent
|| !pedigree_tls
->parent
->parent
);
126 __cilkrts_free(pedigree_tls
);
130 void __cilkrts_init_tls_variables(void)
133 /* This will be called once in serial execution before any
134 Cilk parallelism so we do not need to worry about races
135 on cilk_keys_defined. */
136 if (cilk_keys_defined
)
138 status
= pthread_key_create(&worker_key
, NULL
);
139 CILK_ASSERT (status
== 0);
140 status
= pthread_key_create(&pedigree_leaf_key
,
141 __cilkrts_pedigree_leaf_destructor
);
142 CILK_ASSERT (status
== 0);
143 status
= pthread_key_create(&tbb_interop_key
, NULL
);
144 CILK_ASSERT (status
== 0);
146 #if SUPPORT_GET_CURRENT_FIBER > 0
147 status
= pthread_key_create(&fiber_key
, NULL
);
148 CILK_ASSERT (status
== 0);
150 cilk_keys_defined
= 1;
// Return the calling thread's pthread_self() value, converted to void*.
void* cilkos_get_current_thread_id(void)
{
    pthread_t self = pthread_self();

    return (void*)self;
}
161 CILK_ABI_WORKER_PTR
__cilkrts_get_tls_worker()
163 if (__builtin_expect(cilk_keys_defined
, 1))
164 return (__cilkrts_worker
*)pthread_getspecific(worker_key
);
166 return serial_worker
;
170 CILK_ABI_WORKER_PTR
__cilkrts_get_tls_worker_fast()
172 return (__cilkrts_worker
*)pthread_getspecific(worker_key
);
176 __cilk_tbb_stack_op_thunk
*__cilkrts_get_tls_tbb_interop(void)
178 if (__builtin_expect(cilk_keys_defined
, 1))
179 return (__cilk_tbb_stack_op_thunk
*)
180 pthread_getspecific(tbb_interop_key
);
185 // This counter should be updated atomically.
186 static int __cilkrts_global_pedigree_tls_counter
= -1;
189 __cilkrts_pedigree
*__cilkrts_get_tls_pedigree_leaf(int create_new
)
191 __cilkrts_pedigree
*pedigree_tls
;
192 if (__builtin_expect(cilk_keys_defined
, 1)) {
194 (struct __cilkrts_pedigree
*)pthread_getspecific(pedigree_leaf_key
);
200 if (!pedigree_tls
&& create_new
) {
201 // This call creates two nodes, X and Y.
202 // X == pedigree_tls[0] is the leaf node, which gets copied
203 // in and out of a user worker w when w binds and unbinds.
204 // Y == pedigree_tls[1] is the root node,
205 // which is a constant node that represents the user worker
207 pedigree_tls
= (__cilkrts_pedigree
*)
208 __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree
));
210 // This call sets the TLS pointer to the new node.
211 __cilkrts_set_tls_pedigree_leaf(pedigree_tls
);
213 pedigree_tls
[0].rank
= 0;
214 pedigree_tls
[0].parent
= &pedigree_tls
[1];
216 // Create Y, whose rank begins as the global counter value.
217 pedigree_tls
[1].rank
=
218 __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter
, 1);
220 pedigree_tls
[1].parent
= NULL
;
221 CILK_ASSERT(pedigree_tls
[1].rank
!= -1);
226 #if SUPPORT_GET_CURRENT_FIBER > 0
228 cilk_fiber_sysdep
* cilkos_get_tls_cilk_fiber(void)
230 if (__builtin_expect(cilk_keys_defined
, 1))
231 return (cilk_fiber_sysdep
*)pthread_getspecific(fiber_key
);
238 void __cilkrts_set_tls_worker(__cilkrts_worker
*w
)
240 if (__builtin_expect(cilk_keys_defined
, 1)) {
242 status
= pthread_setspecific(worker_key
, w
);
243 CILK_ASSERT (status
== 0);
253 void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk
*t
)
255 if (__builtin_expect(cilk_keys_defined
, 1)) {
257 status
= pthread_setspecific(tbb_interop_key
, t
);
258 CILK_ASSERT (status
== 0);
265 void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree
* pedigree_leaf
)
267 if (__builtin_expect(cilk_keys_defined
, 1)) {
269 status
= pthread_setspecific(pedigree_leaf_key
, pedigree_leaf
);
270 CILK_ASSERT (status
== 0);
276 #if SUPPORT_GET_CURRENT_FIBER > 0
278 void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep
* fiber
)
280 if (__builtin_expect(cilk_keys_defined
, 1)) {
282 status
= pthread_setspecific(fiber_key
, fiber
);
283 CILK_ASSERT (status
== 0);
291 void __cilkrts_init_tls_variables(void)
296 #if defined (__linux__) && ! defined(__ANDROID__)
/*
 * Get the thread id, rather than the pid. In the case of MIC offload, it's
 * possible that we have multiple threads entering Cilk, and each has a
 * different affinity.
 *
 * Returns the result of the SYS_gettid system call, converted to pid_t.
 */
static pid_t linux_gettid(void)
{
    const long tid = syscall(SYS_gettid);

    return (pid_t)tid;
}
308 * On Linux we look at the thread affinity mask and restrict ourself to one
309 * thread for each of the hardware contexts to which we are bound.
310 * Therefore if user does
311 * % taskset 0-1 cilkProgram
312 * # restrict execution to hardware contexts zero and one
313 * the Cilk program will only use two threads even if it is running on a
314 * machine that has 32 hardware contexts.
315 * This is the right thing to do, because the threads are restricted to two
316 * hardware contexts by the affinity mask set by taskset, and if we were to
317 * create extra threads they would simply oversubscribe the hardware resources
319 * This is particularly important on MIC in offload mode, where the affinity
320 * mask is set by the offload library to force the offload code away from
321 * cores that have offload support threads running on them.
323 static int linux_get_affinity_count ()
325 long system_cores
= sysconf(_SC_NPROCESSORS_ONLN
);
326 int affinity_cores
= 0;
328 #if defined HAVE_PTHREAD_AFFINITY_NP
330 #if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE)
331 // Statically allocated cpu_set_t's max out at 1024 cores. If
332 // CPU_ALLOC_SIZE is available, use it to support large numbers of cores
333 size_t cpusetsize
= CPU_ALLOC_SIZE(system_cores
);
334 cpu_set_t
*process_mask
= (cpu_set_t
*)__cilkrts_malloc(cpusetsize
);
336 // Get the affinity mask for this thread
337 int err
= pthread_getaffinity_np(pthread_self(),
341 // Count the available cores.
343 affinity_cores
= CPU_COUNT_S(cpusetsize
, process_mask
);
345 __cilkrts_free(process_mask
);
348 // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build
349 // and we have to support RHEL5. Use a statically allocated cpu_set_t
351 cpu_set_t process_mask
;
353 // Extract the thread affinity mask
354 int err
= pthread_getaffinity_np(pthread_self(),
355 sizeof(process_mask
),
360 // We have extracted the mask OK, so now we can count the number of
361 // threads in it. This is linear in the maximum number of CPUs
362 // available. We could do a logarithmic version, if we assume the
363 // format of the mask, but it's not really worth it. We only call
364 // this at thread startup anyway.
366 for (i
= 0; i
< CPU_SETSIZE
; i
++)
368 if (CPU_ISSET(i
, &process_mask
))
374 #endif // CPU_ALLOC_SIZE
375 #endif // ! defined HAVE_PTHREAD_AFFINITY_NP
377 // If we've got a count of cores this thread is supposed to use, that's
378 // the number of cores we'll use. Otherwise, default to the number of
379 // cores on the system.
380 if (0 == affinity_cores
)
383 return affinity_cores
;
385 #endif // defined (__linux__) && ! defined(__ANDROID__)
388 * __cilkrts_hardware_cpu_count
390 * Returns the number of available CPUs on this hardware. This is architecture-
394 COMMON_SYSDEP
int __cilkrts_hardware_cpu_count(void)
396 #if defined __ANDROID__ || \
397 defined __CYGWIN__ || \
398 defined __DragonFly__ || \
399 defined __FreeBSD__ || \
400 (defined(__sun__) && defined(__svr4__))
401 return (int)sysconf(_SC_NPROCESSORS_ONLN
);
402 #elif defined __MIC__
403 /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
404 /// on KNC. Also, ignore the last core.
405 int count
= (int)sysconf (_SC_NPROCESSORS_ONLN
);
407 #elif defined __linux__
408 return linux_get_affinity_count();
409 #elif defined __APPLE__
411 size_t len
= sizeof count
;
412 int status
= sysctlbyname("hw.logicalcpu", &count
, &len
, 0, 0);
416 #elif defined __VXWORKS__
417 return __builtin_popcount(vxCpuEnabledGet());
419 #error "Unsupported architecture"
423 COMMON_SYSDEP
void __cilkrts_idle(void)
425 // This is another version of __cilkrts_yield() to be used when
426 // silencing workers that are not stealing work.
427 #if defined(__ANDROID__) || \
428 defined(__FreeBSD__) || \
429 defined(__VXWORKS__) || \
430 (defined(__sun__) && defined(__svr4__))
432 #elif defined(__MIC__)
434 #elif defined(__linux__) || \
438 # error "Unsupported architecture"
442 COMMON_SYSDEP
void __cilkrts_sleep(void)
451 COMMON_SYSDEP
void __cilkrts_yield(void)
453 #if defined(__ANDROID__) || \
454 defined(__APPLE__) || \
455 defined(__FreeBSD__) || \
456 defined(__VXWORKS__) || \
457 (defined(__sun__) && defined(__svr4__))
458 // Call sched_yield to yield quantum. I'm not sure why we
459 // don't do this on Linux also.
461 #elif defined(__MIC__)
462 // On MIC, pthread_yield() really trashes things. Arch's measurements
463 // showed that calling _mm_delay_32() (or doing nothing) was a better
464 // option. Delaying 1024 clock cycles is a reasonable compromise between
465 // giving up the processor and latency starting up when work becomes
468 #elif defined(__linux__)
469 // On Linux, call pthread_yield (which in turn will call sched_yield)
473 # error "Unsupported architecture"
477 COMMON_SYSDEP __STDNS
size_t cilkos_getenv(char* value
, __STDNS
size_t vallen
,
481 CILK_ASSERT(varname
);
483 const char* envstr
= getenv(varname
);
486 size_t len
= cilk_strlen(envstr
);
487 if (len
> vallen
- 1)
489 cilk_strcpy_s(value
, vallen
, envstr
);
500 * Unrecoverable error: Print an error message and abort execution.
502 COMMON_SYSDEP
void cilkos_error(const char *fmt
, ...)
506 fprintf(stderr
, "Cilk error: ");
508 vfprintf(stderr
, fmt
, l
);
510 fprintf(stderr
, "Exiting.\n");
517 * Print a warning message and return.
519 COMMON_SYSDEP
void cilkos_warning(const char *fmt
, ...)
523 fprintf(stderr
, "Cilk warning: ");
525 vfprintf(stderr
, fmt
, l
);
534 __cilkrts_init_tls_variables();
537 _WRS_CONSTRUCTOR(cilkInit
, 100)
539 __cilkrts_init_tls_variables();
// Constructor-attribute hook: sets up the runtime's TLS variables by
// calling __cilkrts_init_tls_variables().
//
// The debugger notification
// (__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED))
// is intentionally left disabled here.
static void __attribute__((constructor)) init_once()
{
    __cilkrts_init_tls_variables();
}
552 #define CILK_MIN_STACK_SIZE (4*PAGE)
553 // Default size for the stacks that we create in Cilk for Unix.
554 #define CILK_DEFAULT_STACK_SIZE 0x100000
557 * Convert the user's specified stack size into a "reasonable" value
560 size_t cilkos_validate_stack_size(size_t specified_stack_size
) {
561 // Convert any negative value to the default.
562 if (specified_stack_size
== 0) {
563 CILK_ASSERT((CILK_DEFAULT_STACK_SIZE
% PAGE
) == 0);
564 return CILK_DEFAULT_STACK_SIZE
;
566 // Round values in between 0 and CILK_MIN_STACK_SIZE up to
567 // CILK_MIN_STACK_SIZE.
568 if (specified_stack_size
<= CILK_MIN_STACK_SIZE
) {
569 return CILK_MIN_STACK_SIZE
;
571 if ((specified_stack_size
% PAGE
) > 0) {
572 // Round the user's stack size value up to nearest page boundary.
573 return (PAGE
* (1 + specified_stack_size
/ PAGE
));
575 return specified_stack_size
;
// Atomically add x to *p and return the updated value of *p.
long cilkos_atomic_add(volatile long* p, long x)
{
    const long updated = __sync_add_and_fetch(p, x);

    return updated;
}