gcc-4_9/libcilkrts/runtime/os-unix.c

   1 /* os-unix.c                  -*-C-*-
   2  *
   3  *************************************************************************
   4  *
   5  *  @copyright
   6  *  Copyright (C) 2009-2013, Intel Corporation
   7  *  All rights reserved.
   8  *
   9  *  @copyright
  10  *  Redistribution and use in source and binary forms, with or without
  11  *  modification, are permitted provided that the following conditions
  12  *  are met:
  13  *
  14  *    * Redistributions of source code must retain the above copyright
  15  *      notice, this list of conditions and the following disclaimer.
  16  *    * Redistributions in binary form must reproduce the above copyright
  17  *      notice, this list of conditions and the following disclaimer in
  18  *      the documentation and/or other materials provided with the
  19  *      distribution.
  20  *    * Neither the name of Intel Corporation nor the names of its
  21  *      contributors may be used to endorse or promote products derived
  22  *      from this software without specific prior written permission.
  23  *
  24  *  @copyright
  25  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  26  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  27  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  28  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  29  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  30  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  31  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  32  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  33  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  35  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36  *  POSSIBILITY OF SUCH DAMAGE.
  37  **************************************************************************/
  38
  39 #ifdef __linux__
  40     // define _GNU_SOURCE before *any* #include.
  41     // Even <stdint.h> will break later #includes if this macro is not
  42     // already defined when it is #included.
  43 #   define _GNU_SOURCE
  44 #endif
  45
  46 #include "os.h"
  47 #include "bug.h"
  48 #include "cilk_malloc.h"
  49 #include <internal/abi.h>
  50
  51 #if defined __linux__
  52 #   include <sys/sysinfo.h>
  53 #   include <sys/syscall.h>
  54 #elif defined __APPLE__
  55 #   include <sys/sysctl.h>
  56     // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
  57 #elif defined  __FreeBSD__
  58 // No additional include files
  59 #elif defined __CYGWIN__
  60 // Cygwin on Windows - no additional include files
  61 #elif defined  __VXWORKS__
  62 #   include <vxWorks.h>
  63 #   include <vxCpuLib.h>
  64 #   include <taskLib.h>
  65 // Solaris
  66 #elif defined __sun__ && defined __svr4__
  67 #   include <sched.h>
  68 #else
  69 #   error "Unsupported OS"
  70 #endif
  71
  72 #include <stdarg.h>
  73 #include <stddef.h>
  74 #include <stdio.h>
  75 #include <stdlib.h>
  76 #include <string.h>
  77 #include <unistd.h>
  78 #include <pthread.h>
  79 #include <sys/types.h>
  80
  81
  82
  83 // /* Thread-local storage */
  84 // #ifdef _WIN32
  85 // typedef unsigned cilkos_tls_key_t;
  86 // #else
  87 // typedef pthread_key_t cilkos_tls_key_t;
  88 // #endif
  89 // cilkos_tls_key_t cilkos_allocate_tls_key();
  90 // void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr);
  91 // void* cilkos_get_tls_pointer(cilkos_tls_key_t key);
  92
  93 #if !defined CILK_WORKER_TLS
// Nonzero once __cilkrts_init_tls_variables() has created the pthread keys
// declared below.  Most accessors in this file test this flag before
// touching a key.
static int cilk_keys_defined;
// pthread keys holding, per thread, the worker pointer, the pedigree leaf
// node and the TBB interop thunk.
static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key;

#if SUPPORT_GET_CURRENT_FIBER > 0
// pthread key for the per-thread fiber pointer; only compiled in when
// SUPPORT_GET_CURRENT_FIBER is enabled.
static pthread_key_t fiber_key;
#endif

// Worker pointer read and written by __cilkrts_get_tls_worker() and
// __cilkrts_set_tls_worker() while cilk_keys_defined is still zero.
static void *serial_worker;
 102
 103
 104 // This destructor is called when a pthread dies to deallocate the
 105 // pedigree node.
 106 static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr)
 107 {
 108     __cilkrts_pedigree* pedigree_tls
 109         = (__cilkrts_pedigree*)pedigree_tls_ptr;
 110     if (pedigree_tls) {
 111         // Assert that we have either one or two nodes
 112         // left in the pedigree chain.
 113         // If we have more, then something is going wrong...
 114         CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent);
 115         __cilkrts_free(pedigree_tls);
 116     }
 117 }
 118
 119 void __cilkrts_init_tls_variables(void)
 120 {
 121     int status;
 122     /* This will be called once in serial execution before any
 123        Cilk parallelism so we do not need to worry about races
 124        on cilk_keys_defined. */
 125     if (cilk_keys_defined)
 126         return;
 127     status = pthread_key_create(&worker_key, NULL);
 128     CILK_ASSERT (status == 0);
 129     status = pthread_key_create(&pedigree_leaf_key,
 130                                 __cilkrts_pedigree_leaf_destructor);
 131     CILK_ASSERT (status == 0);
 132     status = pthread_key_create(&tbb_interop_key, NULL);
 133     CILK_ASSERT (status == 0);
 134
 135 #if SUPPORT_GET_CURRENT_FIBER > 0
 136     status = pthread_key_create(&fiber_key, NULL);
 137     CILK_ASSERT (status == 0);
 138 #endif
 139     cilk_keys_defined = 1;
 140     return;
 141 }
 142
 143 COMMON_SYSDEP
 144 void* cilkos_get_current_thread_id(void)
 145 {
 146     return (void*)pthread_self();
 147 }
 148
 149
 150 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker()
 151 {
 152     if (__builtin_expect(cilk_keys_defined, 1))
 153         return (__cilkrts_worker *)pthread_getspecific(worker_key);
 154     else
 155         return serial_worker;
 156
 157 }
 158
 159 CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast()
 160 {
 161   return (__cilkrts_worker *)pthread_getspecific(worker_key);
 162 }
 163
 164 COMMON_SYSDEP
 165 __cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void)
 166 {
 167     if (__builtin_expect(cilk_keys_defined, 1))
 168         return (__cilk_tbb_stack_op_thunk *)
 169             pthread_getspecific(tbb_interop_key);
 170     else
 171         return 0;
 172 }
 173
 174 // This counter should be updated atomically.
 175 static int __cilkrts_global_pedigree_tls_counter = -1;
 176
 177 COMMON_SYSDEP
 178 __cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
 179 {
 180     __cilkrts_pedigree *pedigree_tls;
 181     if (__builtin_expect(cilk_keys_defined, 1)) {
 182         pedigree_tls =
 183             (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
 184     }
 185     else {
 186         return 0;
 187     }
 188
 189     if (!pedigree_tls && create_new) {
 190         // This call creates two nodes, X and Y.
 191         // X == pedigree_tls[0] is the leaf node, which gets copied
 192         // in and out of a user worker w when w binds and unbinds.
 193         // Y == pedigree_tls[1] is the root node,
 194         // which is a constant node that represents the user worker
 195         // thread w.
 196         pedigree_tls = (__cilkrts_pedigree*)
 197             __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));
 198
 199         // This call sets the TLS pointer to the new node.
 200         __cilkrts_set_tls_pedigree_leaf(pedigree_tls);
 201
 202         pedigree_tls[0].rank = 0;
 203         pedigree_tls[0].parent = &pedigree_tls[1];
 204
 205         // Create Y, whose rank begins as the global counter value.
 206         pedigree_tls[1].rank =
 207             __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);
 208
 209         pedigree_tls[1].parent = NULL;
 210         CILK_ASSERT(pedigree_tls[1].rank != -1);
 211     }
 212     return pedigree_tls;
 213 }
 214
 215 #if SUPPORT_GET_CURRENT_FIBER > 0
 216 COMMON_SYSDEP
 217 cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void)
 218 {
 219     if (__builtin_expect(cilk_keys_defined, 1))
 220         return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key);
 221     else
 222         return NULL;
 223 }
 224 #endif
 225
 226 COMMON_SYSDEP
 227 void __cilkrts_set_tls_worker(__cilkrts_worker *w)
 228 {
 229     if (__builtin_expect(cilk_keys_defined, 1)) {
 230         int status;
 231         status = pthread_setspecific(worker_key, w);
 232         CILK_ASSERT (status == 0);
 233         return;
 234     }
 235     else
 236     {
 237         serial_worker = w;
 238     }
 239 }
 240
 241 COMMON_SYSDEP
 242 void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t)
 243 {
 244     if (__builtin_expect(cilk_keys_defined, 1)) {
 245         int status;
 246         status = pthread_setspecific(tbb_interop_key, t);
 247         CILK_ASSERT (status == 0);
 248         return;
 249     }
 250     abort();
 251 }
 252
 253 COMMON_SYSDEP
 254 void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf)
 255 {
 256     if (__builtin_expect(cilk_keys_defined, 1)) {
 257         int status;
 258         status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf);
 259         CILK_ASSERT (status == 0);
 260         return;
 261     }
 262     abort();
 263 }
 264
 265 #if SUPPORT_GET_CURRENT_FIBER > 0
 266 COMMON_SYSDEP
 267 void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber)
 268 {
 269     if (__builtin_expect(cilk_keys_defined, 1)) {
 270         int status;
 271         status = pthread_setspecific(fiber_key, fiber);
 272         CILK_ASSERT (status == 0);
 273         return;
 274     }
 275     abort();
 276 }
 277 #endif
 278
 279 #else
 280 void __cilkrts_init_tls_variables(void)
 281 {
 282 }
 283 #endif
 284
 285 #if defined (__linux__) && ! defined(__ANDROID__)
 286 /*
 287  * Get the thread id, rather than the pid. In the case of MIC offload, it's
 288  * possible that we have multiple threads entering Cilk, and each has a
 289  * different affinity.
 290  */
 291 static pid_t linux_gettid(void)
 292 {
 293     return syscall(SYS_gettid);
 294 }
 295
 296 /*
 297  * On Linux we look at the thread affinity mask and restrict ourself to one
 298  * thread for each of the hardware contexts to which we are bound.
 299  * Therefore if user does
 300  * % taskset 0-1 cilkProgram
 301  *       # restrict execution to hardware contexts zero and one
 302  * the Cilk program will only use two threads even if it is running on a
 303  * machine that has 32 hardware contexts.
 304  * This is the right thing to do, because the threads are restricted to two
 305  * hardware contexts by the affinity mask set by taskset, and if we were to
 306  * create extra threads they would simply oversubscribe the hardware resources
 307  * we can use.
 308  * This is particularly important on MIC in offload mode, where the affinity
 309  * mask is set by the offload library to force the offload code away from
 310  * cores that have offload support threads running on them.
 311  */
 312 static int linux_get_affinity_count (int tid)
 313 {
 314 #if !defined HAVE_PTHREAD_AFFINITY_NP
 315   return 0;
 316 #else
 317
 318     cpu_set_t process_mask;
 319
 320     // Extract the thread affinity mask
 321     int err = sched_getaffinity (tid, sizeof(process_mask),&process_mask);
 322
 323     if (0 != err)
 324     {
 325         return 0;
 326     }
 327
 328     // We have extracted the mask OK, so now we can count the number of threads
 329     // in it.  This is linear in the maximum number of CPUs available, We
 330     // could do a logarithmic version, if we assume the format of the mask,
 331     // but it's not really worth it. We only call this at thread startup
 332     // anyway.
 333     int available_procs = 0;
 334     int i;
 335     for (i = 0; i < CPU_SETSIZE; i++)
 336     {
 337         if (CPU_ISSET(i, &process_mask))
 338         {
 339             available_procs++;
 340         }
 341     }
 342
 343     return available_procs;
 344 #endif
 345 }
 346 #endif  //  defined (__linux__) && ! defined(__ANDROID__)
 347
 348 /*
 349  * __cilkrts_hardware_cpu_count
 350  *
 351  * Returns the number of available CPUs on this hardware.  This is architecture-
 352  * specific.
 353  */
 354
 355 COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void)
 356 {
 357 #if defined __ANDROID__ || (defined(__sun__) && defined(__svr4__))
 358     return sysconf (_SC_NPROCESSORS_ONLN);
 359 #elif defined __MIC__
 360     /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial
 361     /// on KNC.  Also, ignore the last core.
 362     int P = sysconf (_SC_NPROCESSORS_ONLN);
 363     return P/2 - 2;
 364 #elif defined __linux__
 365     int affinity_count = linux_get_affinity_count(linux_gettid());
 366
 367     return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN);
 368 #elif defined __APPLE__
 369     int count = 0;
 370     int cmd[2] = { CTL_HW, HW_NCPU };
 371     size_t len = sizeof count;
 372     int status = sysctl(cmd, 2, &count, &len, 0, 0);
 373     assert(status >= 0);
 374     assert((unsigned)count == count);
 375
 376     return count;
 377 #elif defined  __FreeBSD__ || defined __CYGWIN__
 378     int ncores = sysconf(_SC_NPROCESSORS_ONLN);
 379
 380     return ncores;
 381     // Just get the number of processors
 382 //    return sysconf(_SC_NPROCESSORS_ONLN);
 383 #elif defined  __VXWORKS__
 384     return __builtin_popcount( vxCpuEnabledGet() );
 385 #else
 386 #error "Unknown architecture"
 387 #endif
 388 }
 389
 390 COMMON_SYSDEP void __cilkrts_sleep(void)
 391 {
 392 #ifdef __VXWORKS__
 393         taskDelay(1);
 394 #else
 395     usleep(1);
 396 #endif
 397 }
 398
 399 COMMON_SYSDEP void __cilkrts_yield(void)
 400 {
 401 #if __APPLE__ || __FreeBSD__ || __VXWORKS__
 402     // On MacOS, call sched_yield to yield quantum.  I'm not sure why we
 403     // don't do this on Linux also.
 404     sched_yield();
 405 #elif defined(__MIC__)
 406     // On MIC, pthread_yield() really trashes things.  Arch's measurements
 407     // showed that calling _mm_delay_32() (or doing nothing) was a better
 408     // option.  Delaying 1024 clock cycles is a reasonable compromise between
 409     // giving up the processor and latency starting up when work becomes
 410     // available
 411     _mm_delay_32(1024);
 412 #elif defined(__ANDROID__) || (defined(__sun__) && defined(__svr4__))
 413     // On Android and Solaris, call sched_yield to yield quantum.  I'm not
 414     // sure why we don't do this on Linux also.
 415     sched_yield();
 416 #else
 417     // On Linux, call pthread_yield (which in turn will call sched_yield)
 418     // to yield quantum.
 419     pthread_yield();
 420 #endif
 421 }
 422
 423 COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen,
 424                                            const char* varname)
 425 {
 426     CILK_ASSERT(value);
 427     CILK_ASSERT(varname);
 428
 429     const char* envstr = getenv(varname);
 430     if (envstr)
 431     {
 432         size_t len = strlen(envstr);
 433         if (len > vallen - 1)
 434             return len + 1;
 435
 436         strcpy(value, envstr);
 437         return len;
 438     }
 439     else
 440     {
 441         value[0] = '\0';
 442         return 0;
 443     }
 444 }
 445
 446 /*
 447  * Unrecoverable error: Print an error message and abort execution.
 448  */
 449 COMMON_SYSDEP void cilkos_error(const char *fmt, ...)
 450 {
 451     va_list l;
 452     fflush(NULL);
 453     fprintf(stderr, "Cilk error: ");
 454     va_start(l, fmt);
 455     vfprintf(stderr, fmt, l);
 456     va_end(l);
 457     fprintf(stderr, "Exiting.\n");
 458     fflush(stderr);
 459
 460     abort();
 461 }
 462
 463 /*
 464  * Print a warning message and return.
 465  */
 466 COMMON_SYSDEP void cilkos_warning(const char *fmt, ...)
 467 {
 468     va_list l;
 469     fflush(NULL);
 470     fprintf(stderr, "Cilk warning: ");
 471     va_start(l, fmt);
 472     vfprintf(stderr, fmt, l);
 473     va_end(l);
 474     fflush(stderr);
 475 }
 476
 477 static void __attribute__((constructor)) init_once()
 478 {
 479     /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/
 480     __cilkrts_init_tls_variables();
 481 }
 482
 483
 484 #define PAGE 4096
 485 #define CILK_MIN_STACK_SIZE (4*PAGE)
 486 // Default size for the stacks that we create in Cilk for Unix.
 487 #define CILK_DEFAULT_STACK_SIZE 0x100000
 488
 489 /*
 490  * Convert the user's specified stack size into a "reasonable" value
 491  * for this OS.
 492  */
 493 size_t cilkos_validate_stack_size(size_t specified_stack_size) {
 494     // Convert any negative value to the default.
 495     if (specified_stack_size == 0) {
 496         CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0);
 497         return CILK_DEFAULT_STACK_SIZE;
 498     }
 499     // Round values in between 0 and CILK_MIN_STACK_SIZE up to
 500     // CILK_MIN_STACK_SIZE.
 501     if (specified_stack_size <= CILK_MIN_STACK_SIZE) {
 502         return CILK_MIN_STACK_SIZE;
 503     }
 504     if ((specified_stack_size % PAGE) > 0) {
 505         // Round the user's stack size value up to nearest page boundary.
 506         return (PAGE * (1 + specified_stack_size / PAGE));
 507     }
 508     return specified_stack_size;
 509 }
 510
 511 long cilkos_atomic_add(volatile long* p, long x)
 512 {
 513     return __sync_add_and_fetch(p, x);
 514 }
 515
 516 /* End os-unix.c */