libcilkrts/runtime/os-unix.c

   1 /* os-unix.c                  -*-C-*-
   2  *
   3  *************************************************************************
   4  *
   5  *  Copyright (C) 2009-2016, Intel Corporation
   6  *  All rights reserved.
   7  *
   8  *  Redistribution and use in source and binary forms, with or without
   9  *  modification, are permitted provided that the following conditions
  10  *  are met:
  11  *
  12  *    * Redistributions of source code must retain the above copyright
  13  *      notice, this list of conditions and the following disclaimer.
  14  *    * Redistributions in binary form must reproduce the above copyright
  15  *      notice, this list of conditions and the following disclaimer in
  16  *      the documentation and/or other materials provided with the
  17  *      distribution.
  18  *    * Neither the name of Intel Corporation nor the names of its
  19  *      contributors may be used to endorse or promote products derived
  20  *      from this software without specific prior written permission.
  21  *
  22  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  26  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  29  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  30  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  32  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33  *  POSSIBILITY OF SUCH DAMAGE.
  34  *
  35  *  *********************************************************************
  36  *
  37  *  PLEASE NOTE: This file is a downstream copy of a file maintained in
  38  *  a repository at cilkplus.org. Changes made to this file that are not
  39  *  submitted through the contribution process detailed at
  40  *  http://www.cilkplus.org/submit-cilk-contribution will be lost the next
  41  *  time that a new version is released. Changes only submitted to the
  42  *  GNU compiler collection or posted to the git repository at
  43  *  https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
  44  *  not tracked.
  45  *
  46  *  We welcome your contributions to this open source project. Thank you
  47  *  for your assistance in helping us improve Cilk Plus.
  48  **************************************************************************/
  49
  50 #include "os.h"
  51 #include "bug.h"
  52 #include "cilk_malloc.h"
  53 #include <internal/abi.h>
  54
  55 #if defined __linux__
  56 #   include <sys/sysinfo.h>
  57 #   include <sys/syscall.h>
  58
  59 #elif defined __APPLE__
  60 #   include <sys/sysctl.h>
  61     // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
  62
  63 #elif defined  __VXWORKS__
  64 #   include <vxWorks.h>
  65 #   include <vxCpuLib.h>
  66 #   include <taskLib.h>
  67
  68 // Solaris
  69 #elif defined __sun__ && defined __svr4__
  70 #   include <sched.h>
  71
  72 // OSes we know about which don't require any additional files
  73 #elif defined __CYGWIN__ || \
  74       defined __DragonFly__ || \
  75       defined __FreeBSD__ || \
  76       defined __GNU__
  77 // No additional include files
  78
  79 #else
  80 #   error "Unsupported OS"
  81 #endif
  82
  83 #include <stdarg.h>
  84 #include <stddef.h>
  85 #include <stdio.h>
  86 #include <stdlib.h>
  87 #include <string.h>
  88 #include <unistd.h>
  89 #include <pthread.h>
  90 #include <sys/types.h>
  91
  92
  93
  94 // /* Thread-local storage */
  95 // #ifdef _WIN32
  96 // typedef unsigned cilkos_tls_key_t;
  97 // #else
  98 // typedef pthread_key_t cilkos_tls_key_t;
  99 // #endif
 100 // cilkos_tls_key_t cilkos_allocate_tls_key();
 101 // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
 102 // void* cilkos_get_tls_pointer(cilkos_tls_key_t key);
 103
 104 #if !defined CILK_WORKER_TLS
 105 static int cilk_keys_defined;
 106 static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;
 107
 108 #if SUPPORT_GET_CURRENT_FIBER > 0
 109 static pthread_key_t fiber_key;
 110 #endif
 111
 112 static void *serial_worker;
 113
 114
 115 // This destructor is called when a pthread dies to deallocate the
 116 // pedigree node.
 117 static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
 118 {
 119     __cilkrts_pedigree* pedigree_tls
 120         = (__cilkrts_pedigree*)pedigree_tls_ptr;
 121     if (pedigree_tls) {
 122         // Assert that we have either one or two nodes
 123         // left in the pedigree chain.
 124         // If we have more, then something is going wrong...
 125         CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
 126         __cilkrts_free(pedigree_tls);
 127     }
 128 }
 129
 130 void __cilkrts_init_tls_variables(void)
 131 {
 132     int status;
 133     /* This will be called once in serial execution before any
 134        Cilk parallelism so we do not need to worry about races
 135        on cilk_keys_defined. */
 136     if (cilk_keys_defined)
 137         return;
 138     status = pthread_key_create(&worker_key, NULL);
 139     CILK_ASSERT (status == 0);
 140     status = pthread_key_create(&pedigree_leaf_key,
 141                                 __cilkrts_pedigree_leaf_destructor);
 142     CILK_ASSERT (status == 0);
 143     status = pthread_key_create(&tbb_interop_key, NULL);
 144     CILK_ASSERT (status == 0);
 145
 146 #if SUPPORT_GET_CURRENT_FIBER > 0
 147     status = pthread_key_create(&fiber_key, NULL);
 148     CILK_ASSERT (status == 0);
 149 #endif
 150     cilk_keys_defined = 1;
 151     return;
 152 }
 153
 154 COMMON_SYSDEP
 155 void* cilkos_get_current_thread_id(void)
 156 {
 157     return (void*)pthread_self();
 158 }
 159
 160
 161 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
 162 {
 163     if (__builtin_expect(cilk_keys_defined, 1))
 164         return (__cilkrts_worker *)pthread_getspecific(worker_key);
 165     else
 166         return serial_worker;
 167
 168 }
 169
 170 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
 171 {
 172   return (__cilkrts_worker *)pthread_getspecific(worker_key);
 173 }
 174
 175 COMMON_SYSDEP
 176 __cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
 177 {
 178     if (__builtin_expect(cilk_keys_defined, 1))
 179         return (__cilk_tbb_stack_op_thunk *)
 180             pthread_getspecific(tbb_interop_key);
 181     else
 182         return 0;
 183 }
 184
 185 // This counter should be updated atomically.
 186 static int __cilkrts_global_pedigree_tls_counter = -1;
 187
 188 COMMON_SYSDEP
 189 __cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
 190 {
 191     __cilkrts_pedigree *pedigree_tls;
 192     if (__builtin_expect(cilk_keys_defined, 1)) {
 193         pedigree_tls =
 194             (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
 195     }
 196     else {
 197         return 0;
 198     }
 199
 200     if (!pedigree_tls && create_new) {
 201         // This call creates two nodes, X and Y.
 202         // X == pedigree_tls[0] is the leaf node, which gets copied
 203         // in and out of a user worker w when w binds and unbinds.
 204         // Y == pedigree_tls[1] is the root node,
 205         // which is a constant node that represents the user worker
 206         // thread w.
 207         pedigree_tls = (__cilkrts_pedigree*)
 208             __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));
 209
 210         // This call sets the TLS pointer to the new node.
 211         __cilkrts_set_tls_pedigree_leaf(pedigree_tls);
 212
 213         pedigree_tls[0].rank = 0;
 214         pedigree_tls[0].parent = &pedigree_tls[1];
 215
 216         // Create Y, whose rank begins as the global counter value.
 217         pedigree_tls[1].rank =
 218             __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);
 219
 220         pedigree_tls[1].parent = NULL;
 221         CILK_ASSERT(pedigree_tls[1].rank != -1);
 222     }
 223     return pedigree_tls;
 224 }
 225
 226 #if SUPPORT_GET_CURRENT_FIBER > 0
 227 COMMON_SYSDEP
 228 cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
 229 {
 230     if (__builtin_expect(cilk_keys_defined, 1))
 231         return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
 232     else
 233         return NULL;
 234 }
 235 #endif
 236
 237 COMMON_SYSDEP
 238 void __cilkrts_set_tls_worker(__cilkrts_worker *w)
 239 {
 240     if (__builtin_expect(cilk_keys_defined, 1)) {
 241         int status;
 242         status = pthread_setspecific(worker_key, w);
 243         CILK_ASSERT (status == 0);
 244         return;
 245     }
 246     else
 247     {
 248         serial_worker = w;
 249     }
 250 }
 251
 252 COMMON_SYSDEP
 253 void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
 254 {
 255     if (__builtin_expect(cilk_keys_defined, 1)) {
 256         int status;
 257         status = pthread_setspecific(tbb_interop_key, t);
 258         CILK_ASSERT (status == 0);
 259         return;
 260     }
 261     abort();
 262 }
 263
 264 COMMON_SYSDEP
 265 void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
 266 {
 267     if (__builtin_expect(cilk_keys_defined, 1)) {
 268         int status;
 269         status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
 270         CILK_ASSERT (status == 0);
 271         return;
 272     }
 273     abort();
 274 }
 275
 276 #if SUPPORT_GET_CURRENT_FIBER > 0
 277 COMMON_SYSDEP
 278 void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
 279 {
 280     if (__builtin_expect(cilk_keys_defined, 1)) {
 281         int status;
 282         status = pthread_setspecific(fiber_key, fiber);
 283         CILK_ASSERT (status == 0);
 284         return;
 285     }
 286     abort();
 287 }
 288 #endif
 289
 290 #else
 291 void __cilkrts_init_tls_variables(void)
 292 {
 293 }
 294 #endif
 295
 296 #if defined (__linux__) && ! defined(__ANDROID__)
 297 /*
 298  * Get the thread id, rather than the pid. In the case of MIC offload, it's
 299  * possible that we have multiple threads entering Cilk, and each has a
 300  * different affinity.
 301  */
 302 static pid_t linux_gettid(void)
 303 {
 304     return syscall(SYS_gettid);
 305 }
 306
 307 /*
 308  * On Linux we look at the thread affinity mask and restrict ourself to one
 309  * thread for each of the hardware contexts to which we are bound.
 310  * Therefore if user does
 311  * % taskset 0-1 cilkProgram
 312  *       # restrict execution to hardware contexts zero and one
 313  * the Cilk program will only use two threads even if it is running on a
 314  * machine that has 32 hardware contexts.
 315  * This is the right thing to do, because the threads are restricted to two
 316  * hardware contexts by the affinity mask set by taskset, and if we were to
 317  * create extra threads they would simply oversubscribe the hardware resources
 318  * we can use.
 319  * This is particularly important on MIC in offload mode, where the affinity
 320  * mask is set by the offload library to force the offload code away from
 321  * cores that have offload support threads running on them.
 322  */
 323 static int linux_get_affinity_count ()
 324 {
 325     long system_cores = sysconf(_SC_NPROCESSORS_ONLN);
 326     int affinity_cores = 0;
 327
 328 #if defined HAVE_PTHREAD_AFFINITY_NP
 329
 330 #if defined (CPU_ALLOC_SIZE) && ! defined(DONT_USE_CPU_ALLOC_SIZE)
 331     // Statically allocated cpu_set_t's max out at 1024 cores.  If
 332     // CPU_ALLOC_SIZE is available, use it to support large numbers of cores
 333     size_t cpusetsize = CPU_ALLOC_SIZE(system_cores);
 334     cpu_set_t *process_mask = (cpu_set_t *)__cilkrts_malloc(cpusetsize);
 335
 336     // Get the affinity mask for this thread
 337     int err = pthread_getaffinity_np(pthread_self(),
 338                                      cpusetsize,
 339                                      process_mask);
 340
 341     // Count the available cores.
 342     if (0 == err)
 343         affinity_cores = CPU_COUNT_S(cpusetsize, process_mask);
 344
 345     __cilkrts_free(process_mask);
 346
 347 #else
 348     // CPU_ALLOC_SIZE isn't available, or this is the Intel compiler build
 349     // and we have to support RHEL5.  Use a statically allocated cpu_set_t
 350
 351     cpu_set_t process_mask;
 352
 353     // Extract the thread affinity mask
 354     int err = pthread_getaffinity_np(pthread_self(),
 355                                      sizeof(process_mask),
 356                                      &process_mask);
 357
 358     if (0 == err)
 359     {
 360         // We have extracted the mask OK, so now we can count the number of
 361         // threads in it.  This is linear in the maximum number of CPUs
 362         // available, We could do a logarithmic version, if we assume the
 363         // format of the mask, but it's not really worth it. We only call
 364         // this at thread startup anyway.
 365         int i;
 366         for (i = 0; i < CPU_SETSIZE; i++)
 367         {
 368             if (CPU_ISSET(i, &process_mask))
 369             {
 370                 affinity_cores++;
 371             }
 372         }
 373     }
 374 #endif  // CPU_ALLOC_SIZE
 375 #endif  //  ! defined HAVE_PTHREAD_AFFINITY_NP
 376
 377     // If we've got a count of cores this thread is supposed to use, that's
 378     // the number or cores we'll use.  Otherwise, default to the number of
 379     // cores on the system.
 380     if (0 == affinity_cores)
 381         return system_cores;
 382     else
 383         return affinity_cores;
 384 }
 385 #endif  //  defined (__linux__) && ! defined(__ANDROID__)
 386
 387 /*
 388  * __cilkrts_hardware_cpu_count
 389  *
 390  * Returns the number of available CPUs on this hardware.  This is architecture-
 391  * specific.
 392  */
 393
 394 COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
 395 {
 396 #if defined __ANDROID__  || \
 397     defined __CYGWIN__   || \
 398     defined __DragonFly__  || \
 399     defined __FreeBSD__  || \
 400     (defined(__sun__) && defined(__svr4__))
 401     return (int)sysconf(_SC_NPROCESSORS_ONLN);
 402 #elif defined __MIC__
 403     /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
 404     /// on KNC.  Also, ignore the last core.
 405     int count = (int)sysconf (_SC_NPROCESSORS_ONLN);
 406     return count/2 - 2;
 407 #elif defined __linux__
 408     return linux_get_affinity_count();
 409 #elif defined __APPLE__
 410     int count;
 411     size_t len = sizeof count;
 412     int status = sysctlbyname("hw.logicalcpu", &count, &len, 0, 0);
 413     assert(0 == status);
 414
 415     return count;
 416 #elif defined  __VXWORKS__
 417     return __builtin_popcount(vxCpuEnabledGet());
 418 #else
 419 #error "Unsupported architecture"
 420 #endif
 421 }
 422
 423 COMMON_SYSDEP void __cilkrts_idle(void)
 424 {
 425     // This is another version of __cilkrts_yield() to be used when
 426     // silencing workers that are not stealing work.
 427 #if defined(__ANDROID__)  || \
 428     defined(__FreeBSD__)  || \
 429     defined(__VXWORKS__)  || \
 430     (defined(__sun__) && defined(__svr4__))
 431     sched_yield();
 432 #elif defined(__MIC__)
 433     _mm_delay_32(1024);
 434 #elif defined(__linux__) || \
 435       defined(__APPLE__)
 436     usleep(10000);
 437 #else
 438 # error "Unsupported architecture"
 439 #endif
 440 }
 441
 442 COMMON_SYSDEP void __cilkrts_sleep(void)
 443 {
 444 #ifdef __VXWORKS__
 445     taskDelay(1);
 446 #else
 447     usleep(1);
 448 #endif
 449 }
 450
 451 COMMON_SYSDEP void __cilkrts_yield(void)
 452 {
 453 #if defined(__ANDROID__)  || \
 454     defined(__APPLE__)    || \
 455     defined(__FreeBSD__)  || \
 456     defined(__VXWORKS__)  || \
 457     (defined(__sun__) && defined(__svr4__))
 458     // Call sched_yield to yield quantum.  I'm not sure why we
 459     // don't do this on Linux also.
 460     sched_yield();
 461 #elif defined(__MIC__)
 462     // On MIC, pthread_yield() really trashes things.  Arch's measurements
 463     // showed that calling _mm_delay_32() (or doing nothing) was a better
 464     // option.  Delaying 1024 clock cycles is a reasonable compromise between
 465     // giving up the processor and latency starting up when work becomes
 466     // available
 467     _mm_delay_32(1024);
 468 #elif defined(__linux__)
 469     // On Linux, call pthread_yield (which in turn will call sched_yield)
 470     // to yield quantum.
 471     pthread_yield();
 472 #else
 473 # error "Unsupported architecture"
 474 #endif
 475 }
 476
 477 COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
 478                                            const char* varname)
 479 {
 480     CILK_ASSERT(value);
 481     CILK_ASSERT(varname);
 482
 483     const char* envstr = getenv(varname);
 484     if (envstr)
 485     {
 486         size_t len = cilk_strlen(envstr);
 487         if (len > vallen - 1)
 488             return len + 1;
 489         cilk_strcpy_s(value, vallen, envstr);
 490         return len;
 491     }
 492     else
 493     {
 494         value[0] = '\0';
 495         return 0;
 496     }
 497 }
 498
 499 /*
 500  * Unrecoverable error: Print an error message and abort execution.
 501  */
 502 COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
 503 {
 504     va_list l;
 505     fflush(NULL);
 506     fprintf(stderr, "Cilk error: ");
 507     va_start(l, fmt);
 508     vfprintf(stderr, fmt, l);
 509     va_end(l);
 510     fprintf(stderr, "Exiting.\n");
 511     fflush(stderr);
 512
 513     abort();
 514 }
 515
 516 /*
 517  * Print a warning message and return.
 518  */
 519 COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
 520 {
 521     va_list l;
 522     fflush(NULL);
 523     fprintf(stderr, "Cilk warning: ");
 524     va_start(l, fmt);
 525     vfprintf(stderr, fmt, l);
 526     va_end(l);
 527     fflush(stderr);
 528 }
 529
 530 #ifdef __VXWORKS__
 531 #ifdef _WRS_KERNEL
 532 void cilkStart()
 533 {
 534     __cilkrts_init_tls_variables();
 535 }
 536 #else
 537 _WRS_CONSTRUCTOR(cilkInit, 100)
 538 {
 539     __cilkrts_init_tls_variables();
 540 }
 541 #endif
 542 #else
 543 static void __attribute__((constructor)) init_once()
 544 {
 545     /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
 546     __cilkrts_init_tls_variables();
 547 }
 548 #endif
 549
 550
 551 #define PAGE 4096
 552 #define CILK_MIN_STACK_SIZE (4*PAGE)
 553 // Default size for the stacks that we create in Cilk for Unix.
 554 #define CILK_DEFAULT_STACK_SIZE 0x100000
 555
 556 /*
 557  * Convert the user's specified stack size into a "reasonable" value
 558  * for this OS.
 559  */
 560 size_t cilkos_validate_stack_size(size_t specified_stack_size) {
 561     // Convert any negative value to the default.
 562     if (specified_stack_size == 0) {
 563         CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
 564         return CILK_DEFAULT_STACK_SIZE;
 565     }
 566     // Round values in between 0 and CILK_MIN_STACK_SIZE up to
 567     // CILK_MIN_STACK_SIZE.
 568     if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
 569         return CILK_MIN_STACK_SIZE;
 570     }
 571     if ((specified_stack_size % PAGE) > 0) {
 572         // Round the user's stack size value up to nearest page boundary.
 573         return (PAGE * (1 + specified_stack_size / PAGE));
 574     }
 575     return specified_stack_size;
 576 }
 577
 578 long cilkos_atomic_add(volatile long* p, long x)
 579 {
 580     return __sync_add_and_fetch(p, x);
 581 }
 582
 583 /* End os-unix.c */