1 /* global_state.cpp -*-C++-*-
3 *************************************************************************
6 * Copyright (C) 2009-2013, Intel Corporation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
20 * * Neither the name of Intel Corporation nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 **************************************************************************/
39 #include "global_state.h"
42 #include "metacall_impl.h"
44 #include "cilk/cilk_api.h"
45 #include "cilk_malloc.h"
46 #include "record-replay.h"
48 #include <algorithm> // For max()
58 // TBD: There is a race when multiple threads try to initialize the
59 // user_settable_values??
// (The flag below is a plain int; no locking around it is visible in this file.)
61 // Set to true if the user settable values portion of the global state
62 // singleton is initialized, even if the rest of the singleton is not
// initialized. Tested and then set by cilkg_get_user_settable_values().
64 int cilkg_user_settable_values_initialized
= false;
68 // Single copy of the global state. Zero-filled until
69 // cilkg_get_user_settable_values() is called and partially-zero-filled until
70 // cilkg_init_global_state() is called. The first field is filled in with
71 // the size of a void* for the debugger and must be valid before initialization
// NOTE(review): the aggregate initializer's delimiters are not visible here;
// only the first field (addr_size) is given an explicit value.
72 global_state_t global_state_singleton
=
74 sizeof(void *), // addr_size
78 // Variables that need to export C-style names
81 // Pointer to the global state singleton.
// NULL until cilkg_publish_global_state() stores the state pointer; reset to
// NULL by cilkg_deinit_global_state(). cilkg_is_published() tests it.
82 global_state_t
*cilkg_singleton_ptr
= NULL
;
84 // __cilkrts_global_state is exported and referenced by the debugger.
85 // The debugger expects it to be valid when the module loads.
// Hence it points at the static singleton from the start instead of NULL.
87 global_state_t
*__cilkrts_global_state
= &global_state_singleton
;
// Returns true if 'a' and 'b' are equal null-terminated strings.
// Both pointers are handed straight to std::strcmp, so neither may be null.
inline bool strmatch(const char* a, const char* b)
{
    return 0 == std::strcmp(a, b);
}
96 // Returns the integer value represented by the null-terminated string at 's'.
// Base 0 lets std::strtol accept decimal, octal ("0" prefix) and hexadecimal
// ("0x" prefix) input. The end pointer is collected but not inspected here.
// NOTE(review): the declaration of 'end' is not visible in this view.
97 inline long to_long(const char* s
)
102 return std::strtol(s
, &end
, 0);
// Returns true if 'a' and 'b' are equal null-terminated wide-char strings.
// Both pointers are handed straight to wcscmp, so neither may be null.
inline bool strmatch(const wchar_t* a, const wchar_t* b)
{
    return 0 == wcscmp(a, b);
}
112 // Returns true if the multi-byte character string at 'a' represents the same
113 // character sequence as the wide-character string at 'b'. The behavior is
114 // undefined if 'a' contains more than 30 multi-byte characters.
115 bool strmatch(const char* a
, const wchar_t* b
)
117 // Convert 'a' to wide-characters, then compare.
// At most 30 characters are requested from mbstowcs_s (its last argument).
// NOTE(review): the declarations of 'count' and 'wa' are not visible in this
// view; 'wa' presumably has room for 30 characters plus a terminator - confirm.
120 errno_t err
= mbstowcs_s(&count
, wa
, a
, 30);
// A non-zero 'err' trips the assertion; the explicit check after it reports
// "no match" for the same condition.
121 CILK_ASSERT(0 == err
);
122 if (err
) return false;
// Delegate to the wide/wide overload now that both sides are wchar_t.
123 return strmatch(wa
, b
);
126 // Returns true if the wide-character string at 'a' represents the same
127 // character sequence as the multi-byte character string at 'b'. The behavior
128 // id undefined if 'b' contains more than 30 multi-byte characters.
130 bool strmatch(const wchar_t* a
, const char* b
)
132 return strmatch(b
, a
);
136 // Returns the integer value represented by the null-terminated wide-char
// string at 's'. Base 0 lets wcstol accept decimal, octal and hexadecimal
// input. The end pointer is collected but not inspected here.
// NOTE(review): the declaration of 'end' is not visible in this view.
138 inline long to_long(const wchar_t* s
)
143 return wcstol(s
, &end
, 0);
147 // Check if Cilkscreen or other sequential ptool wants to force reducers.
148 bool always_force_reduce()
150 // Metacall *looks* like a no-op. volatile needed to keep compiler from
151 // optimizing away variable.
152 volatile char not_force_reduce
= '\377';
153 __cilkrts_metacall(METACALL_TOOL_SYSTEM
, HYPER_ZERO_IF_FORCE_REDUCE
,
154 const_cast<char*>(¬_force_reduce
));
155 return ! not_force_reduce
;
158 // Stores the boolean value represented by the null-terminated string at 'val'
159 // into the integer object at 'out'. Returns '__CILKRTS_SET_PARAM_SUCCESS' if
160 // 'val' is "true", "false", "0" or "1" and '__CILKRTS_SET_PARAM_INVALID'
// otherwise.
// INT_T is the integer type of the destination; CHAR_T is the character type
// of 'val' (strmatch is overloaded for narrow and wide strings).
162 template <typename INT_T
, typename CHAR_T
>
163 int store_bool(INT_T
*out
, const CHAR_T
*val
)
// Accepted spellings. They are narrow literals; strmatch() handles the
// comparison against a CHAR_T string.
165 static const char* const s_zero
= "0";
166 static const char* const s_one
= "1";
167 static const char* const s_true
= "true";
168 static const char* const s_false
= "false";
// NOTE(review): the condition guarding this early return is not visible in
// this view (presumably a null check on 'val' - confirm).
171 return __CILKRTS_SET_PARAM_INVALID
;
// "false" / "0" case.
// NOTE(review): the store to '*out' is not visible in this view.
173 if (strmatch(s_false
, val
) || strmatch(s_zero
, val
)) {
175 return __CILKRTS_SET_PARAM_SUCCESS
;
// "true" / "1" case.
// NOTE(review): the store to '*out' is not visible in this view.
178 if (strmatch(s_true
, val
) || strmatch(s_one
, val
)) {
180 return __CILKRTS_SET_PARAM_SUCCESS
;
// Anything else is not a recognised boolean spelling.
183 return __CILKRTS_SET_PARAM_INVALID
;
186 // Stores the integer value represented by the null-terminated string at 'val'
187 // into the integer object at 'out', restricting the result to the range 'min'
188 // to 'max', inclusive. Returns '__CILKRTS_SET_PARAM_SUCCESS' if the conversion
189 // succeeds and is in range, '__CILKRTS_SET_PARAM_XRANGE' if the conversion
190 // succeeds but is out of range, and '__CILKRTS_SET_PARAM_INVALID' otherwise. In
191 // the case of any error, '*out' is unchanged.
192 template <typename INT_T
, typename CHAR_T
>
193 int store_int(INT_T
*out
, const CHAR_T
*val
, INT_T min
, INT_T max
)
// Parse through a long first; to_long() is overloaded on the character type.
196 long val_as_long
= to_long(val
);
// A zero result together with a non-zero errno is treated as a failed parse.
197 if (val_as_long
== 0 && errno
!= 0)
198 return __CILKRTS_SET_PARAM_INVALID
;
// LONG_MIN / LONG_MAX are what strtol/wcstol return on overflow, so they are
// rejected as out of range along with anything outside [min, max].
199 if (val_as_long
< min
|| val_as_long
== LONG_MIN
)
200 return __CILKRTS_SET_PARAM_XRANGE
;
201 else if (val_as_long
> max
|| val_as_long
== LONG_MAX
)
202 return __CILKRTS_SET_PARAM_XRANGE
;
// NOTE(review): the store of the validated value to '*out' is not visible in
// this view.
205 return __CILKRTS_SET_PARAM_SUCCESS
;
208 // Implementation of cilkg_set_param templatized on character type.
209 // Windows will instantiate with both char and wchar_t.
210 // Note that g must have its user settable values set, but need not be fully
// initialized.
//
// Matches 'param' against the known parameter names and stores 'value' into the
// corresponding field of 'g'. Returns one of the __CILKRTS_SET_PARAM_* codes:
// INVALID / XRANGE from the store helpers, LATE when the setting can no longer
// be changed, and UNIMP when 'param' matches none of the names.
212 template <class CHAR_T
>
213 int set_param_imp(global_state_t
* g
, const CHAR_T
* param
, const CHAR_T
* value
)
// Parameter names recognised below.
215 static const char* const s_force_reduce
= "force reduce";
216 static const char* const s_nworkers
= "nworkers";
217 static const char* const s_max_user_workers
= "max user workers";
218 static const char* const s_local_stacks
= "local stacks";
219 static const char* const s_shared_stacks
= "shared stacks";
220 static const char* const s_nstacks
= "nstacks";
221 static const char* const s_stack_size
= "stack size";
223 // We must have a parameter and a value
// NOTE(review): the conditions guarding these two returns are not visible in
// this view.
225 return __CILKRTS_SET_PARAM_INVALID
;
227 return __CILKRTS_SET_PARAM_INVALID
;
229 if (strmatch(param
, s_force_reduce
))
231 // Sets whether we force a reduce operation at every sync. Useful for
232 // debugging reducers. Off by default. Overridden by Cilkscreen
234 // Documented in cilk_api_<os>.h
235 if (always_force_reduce())
236 // Force reduce is set by cilkscreen. User cannot change it.
237 return __CILKRTS_SET_PARAM_LATE
;
239 return store_bool(&g
->force_reduce
, value
);
241 else if (strmatch(param
, s_nworkers
))
243 // Set the total number of workers. Overrides count of cores we get
244 // from the OS and the setting of the CILK_NWORKERS environment
245 // variable. Setting to 0 indicates that the default worker count
248 // Documented in cilk_api_<os>.h
// Rejected once the global state has been published.
249 if (cilkg_singleton_ptr
)
250 return __CILKRTS_SET_PARAM_LATE
;
252 // Fetch the number of cores. There must be at least 1, since we're
253 // executing on *something*, aren't we!?
254 int hardware_cpu_count
= __cilkrts_hardware_cpu_count();
255 CILK_ASSERT(hardware_cpu_count
> 0);
// The upper bound is computed from the real core count, before the
// sequential-ptool override below replaces that count with 1.
257 int max_cpu_count
= 16 * hardware_cpu_count
;
258 if (__cilkrts_running_under_sequential_ptool())
260 hardware_cpu_count
= 1;
263 // Allow a value of 0, which means "set to hardware thread count".
264 int ret
= store_int(&g
->P
, value
, 0, max_cpu_count
);
// NOTE(review): the condition guarding this assignment and the return of 'ret'
// are not visible in this view.
266 g
->P
= hardware_cpu_count
;
269 else if (strmatch(param
, s_max_user_workers
))
271 // ** UNDOCUMENTED **
273 // Sets the number of slots allocated for user worker threads
274 int hardware_cpu_count
= __cilkrts_hardware_cpu_count();
275 CILK_ASSERT (hardware_cpu_count
> 0);
// Accepted range is 1 .. 16 * hardware_cpu_count.
277 return store_int(&g
->max_user_workers
, value
, 1,
278 16 * hardware_cpu_count
);
280 else if (strmatch(param
, s_local_stacks
))
282 // ** UNDOCUMENTED **
284 // Number of stacks we'll hold in the per-worker stack cache. Maximum
285 // value is 42. See __cilkrts_make_global_state for details.
286 return store_int(&g
->fiber_pool_size
, value
, 0, 42);
288 else if (strmatch(param
, s_shared_stacks
))
290 // ** UNDOCUMENTED **
292 // Maximum number of stacks we'll hold in the global stack
293 // cache. Maximum value is 42. See __cilkrts_make_global_state for
// details.
295 return store_int(&g
->global_fiber_pool_size
, value
, 0, 42);
297 else if (strmatch(param
, s_nstacks
))
299 // Sets the maximum number of stacks permitted at one time. If the
300 // runtime reaches this maximum, it will cease to allocate stacks and
301 // the app will lose parallelism. 0 means unlimited. Default is
302 // unlimited. Minimum is twice the number of worker threads, though
303 // that cannot be tested at this time.
305 // Undocumented at this time, though there are plans to expose it.
306 // The current implementation is for Linux debugging only and is not
307 // robust enough for users.
// Rejected once the global state has been published.
308 if (cilkg_singleton_ptr
)
309 return __CILKRTS_SET_PARAM_LATE
;
310 return store_int
<unsigned>(&g
->max_stacks
, value
, 0, INT_MAX
);
312 else if (strmatch(param
, s_stack_size
))
314 // ** UNDOCUMENTED **
316 // Sets the size (in bytes) of the stacks that Cilk creates.
317 // Can only be set before the runtime starts.
318 if (cilkg_singleton_ptr
)
319 return __CILKRTS_SET_PARAM_LATE
;
321 // Maximum value that can be parsed is MAX_INT (32-bit).
322 int ret
= store_int
<size_t>(&g
->stack_size
, value
, 0, INT_MAX
);
324 // Process the value the user set (or 0 if the user didn't set
325 // anything) into something nice for the current OS. This
326 // processing is done immediately and stored into
327 // g->stack_size so that a call to get stack size will return
328 // the value that the runtime will actually use.
329 g
->stack_size
= cilkos_validate_stack_size(g
->stack_size
);
// NOTE(review): the return of 'ret' is not visible in this view.
334 // If got here, then didn't match any of the strings
335 return __CILKRTS_SET_PARAM_UNIMP
;
339 int calc_max_user_workers(global_state_t
*g
)
341 // If it's been set by the user, give back what we got
342 if (g
->max_user_workers
> 0)
343 return g
->max_user_workers
;
346 return std::max(3, g
->P
* 2);
349 } // end unnamed namespace
351 __CILKRTS_BEGIN_EXTERN_C
/**
 * @brief Returns the global state object.  If called for the first time,
 * initializes the user-settable values in the global state, but does not
 * initialize the rest of the structure.
 */
// Returns a pointer to the static global_state_singleton. On the first call
// (while cilkg_user_settable_values_initialized is false) it fills in the
// defaults for the user-settable fields and then applies overrides from the
// CILK_* environment variables read below.
358 global_state_t
* cilkg_get_user_settable_values()
360 // Environment variable value. More than big enough for a 64-bit signed
// integer.
// NOTE(review): the declaration of the 'envstr' buffer is not visible in this
// view.
364 // Abbreviating &global_state_singleton as g is not only shorter, it also
365 // facilitates grepping for the string "g->", which appears ubiquitously
366 // in the runtime code.
367 global_state_t
* g
= &global_state_singleton
;
369 // TBD: We need synchronization around this loop to prevent
370 // multiple threads from initializing this data.
371 if (! cilkg_user_settable_values_initialized
)
375 // Preserve stealing disabled since it may have been set by the
// debugger; the memset below would otherwise clear it.
377 int stealing_disabled
= g
->stealing_disabled
;
379 // All fields will be zero until set. In particular
// this also clears addr_size, which cilkg_init_global_state() sets again.
380 std::memset(g
, 0, sizeof(global_state_t
));
382 // Fetch the number of cores. There must be at least 1, since we're
383 // executing on *something*, aren't we!?
384 int hardware_cpu_count
= __cilkrts_hardware_cpu_count();
385 CILK_ASSERT(hardware_cpu_count
> 0);
387 bool under_ptool
= __cilkrts_running_under_sequential_ptool();
// NOTE(review): the condition guarding this assignment is not visible in this
// view (presumably 'under_ptool' - confirm).
389 hardware_cpu_count
= 1;
391 g
->stealing_disabled
= stealing_disabled
;
392 g
->under_ptool
= under_ptool
;
393 g
->force_reduce
= 0; // Default Off
394 g
->P
= hardware_cpu_count
; // Defaults to hardware CPU count
395 g
->max_user_workers
= 0; // 0 unless set by user
396 g
->fiber_pool_size
= 7; // Arbitrary default
398 g
->global_fiber_pool_size
= 3 * 3* g
->P
; // Arbitrary default
399 // 3*P was the default size of the worker array (including
400 // space for extra user workers). This parameter was chosen
401 // to match previous versions of the runtime.
// Default stack limit depends on pointer width.
// NOTE(review): the 'else' between the two assignments is not visible in this
// view.
403 if (4 == sizeof(void *))
404 g
->max_stacks
= 1200; // Only 1GB on 32-bit machines
406 g
->max_stacks
= 2400; // 2GB on 64-bit machines
408 // If we have 2400 1MB stacks, that is 2 gb. If we reach this
409 // limit on a single-socket machine, we may have other
410 // problems. Is 2400 too small for large multicore machines?
412 // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a
413 // value independent of P. When running on a Xeon Phi with
414 // small values of P, I recall seeing a few microbenchmarks
415 // (e.g., fib) where a limit of 10*P seemed to be
416 // unnecessarily slowing things down.
418 // That being said, the code has changed sufficiently that
419 // this observation may no longer be true.
421 // Note: in general, the worst-case number of stacks required
422 // for a Cilk computation with spawn depth "d" on P workers is
423 // O(Pd). Code with unbalanced recursion may run into issues
424 // with this stack usage.
426 g
->max_steal_failures
= 128; // TBD: depend on max_workers?
427 g
->stack_size
= 0; // 0 unless set by the user
429 // Assume no record or replay log for now
430 g
->record_replay_file_name
= NULL
;
431 g
->record_or_replay
= RECORD_REPLAY_NONE
; // set by user
// A tool-forced reduce wins over the CILK_FORCE_REDUCE environment variable.
433 if (always_force_reduce())
434 g
->force_reduce
= true;
435 else if (cilkos_getenv(envstr
, sizeof(envstr
), "CILK_FORCE_REDUCE"))
436 store_bool(&g
->force_reduce
, envstr
);
// NOTE(review): the 'if' that the following 'else if' pairs with is not visible
// in this view.
439 g
->P
= 1; // Ignore environment variable if under cilkscreen
440 else if (cilkos_getenv(envstr
, sizeof(envstr
), "CILK_NWORKERS"))
441 // Set P to environment variable, but limit to no less than 1
442 // and no more than 16 times the number of hardware threads.
443 store_int(&g
->P
, envstr
, 1, 16 * hardware_cpu_count
);
445 if (cilkos_getenv(envstr
, sizeof(envstr
), "CILK_MAX_USER_WORKERS"))
446 // Set max_user_workers to environment variable, but limit to no
447 // less than 1 and no more 16 times the number of hardware
448 // threads. If not specified, defaults (somewhat arbitrarily) to
449 // the larger of 3 and twice the number of hardware threads.
450 store_int(&g
->max_user_workers
, envstr
, 1, 16*hardware_cpu_count
);
452 if (cilkos_getenv(envstr
, sizeof(envstr
), "CILK_STEAL_FAILURES"))
453 // Set the number of times a worker should fail to steal before
454 // it looks to see whether it should suspend itself.
455 store_int
<unsigned>(&g
->max_steal_failures
, envstr
, 1, INT_MAX
);
457 // Compute the total number of workers to allocate. Subtract one from
458 // nworkers and user workers so that the first user worker isn't
459 // factored in twice.
461 // total_workers must be computed now to support __cilkrts_get_total_workers
462 g
->total_workers
= g
->P
+ calc_max_user_workers(g
) - 1;
// NOTE(review): the declaration of 'len', the length checks around the two log
// blocks, and the matching #endif are not visible in this view.
464 #ifdef CILK_RECORD_REPLAY
465 // RecordReplay: See if we've been asked to replay a log
// A zero-size query is used to obtain the length before allocating.
466 len
= cilkos_getenv(envstr
, 0, "CILK_REPLAY_LOG");
469 len
+= 1; // Allow for trailing NUL
470 g
->record_or_replay
= REPLAY_LOG
;
471 g
->record_replay_file_name
= (char *)__cilkrts_malloc(len
);
472 cilkos_getenv(g
->record_replay_file_name
, len
, "CILK_REPLAY_LOG");
475 // RecordReplay: See if we've been asked to record a log
476 len
= cilkos_getenv(envstr
, 0, "CILK_RECORD_LOG");
// Replay takes precedence: if a replay log was already selected, only warn.
479 if (RECORD_REPLAY_NONE
!= g
->record_or_replay
)
480 cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n");
483 len
+= 1; // Allow for trailing NUL
484 g
->record_or_replay
= RECORD_LOG
;
485 g
->record_replay_file_name
= (char *)__cilkrts_malloc(len
);
486 cilkos_getenv(g
->record_replay_file_name
, len
, "CILK_RECORD_LOG");
// Mark the user-settable portion as done so later calls skip the block above.
// NOTE(review): the function's return statement is not visible in this view.
491 cilkg_user_settable_values_initialized
= true;
497 int cilkg_calc_total_workers()
499 global_state_t
* g
= cilkg_get_user_settable_values();
501 // Compute the total number of workers to allocate. Subtract one from
502 // nworkers and user workers so that the first user worker isn't
503 // factored in twice.
504 return g
->P
+ calc_max_user_workers(g
) - 1;
507 // Should be called while holding the global lock.
// Returns the already-published state if there is one; otherwise finishes
// initializing the singleton obtained from cilkg_get_user_settable_values().
508 global_state_t
* cilkg_init_global_state()
510 if (cilkg_singleton_ptr
)
511 return cilkg_singleton_ptr
;
513 // Get partially-initialized global state.
514 global_state_t
* g
= cilkg_get_user_settable_values();
// A max_stacks of 0 skips the adjustments in this block.
516 if (g
->max_stacks
> 0) {
518 // nstacks is currently honored on non-Windows systems only.
520 // Set an upper bound on the number of stacks that are allocated. If
521 // nstacks is set, each worker gets up to one stack in its cache so that
522 // no one worker can hog all of the free stacks and keep work from being
523 // stolen by the other workers.
525 // nstacks corresponds to the number of stacks that will be allocated by
526 // the runtime apart from the initial stack created for each thread by
527 // the system. Therefore, if a user asks for n stacks, and there are
528 // p workers created, the total number of stacks is actually n + p.
530 // This feature is primarily for MIC which has flat memory
531 // instead of virtual addresses and tends to run out really quickly.
532 // It is not implemented for Windows and its non-intuitive
533 // interaction with the local stack cache is specifically to help out
536 // About max_stacks / P stacks, except we require at least 1
// Shrink the per-worker cache to max_stacks / P when that is smaller.
538 if (((int)g
->max_stacks
/ g
->P
) < g
->fiber_pool_size
)
539 g
->fiber_pool_size
= g
->max_stacks
/ g
->P
;
// The integer division above can yield 0; keep at least one cached stack.
541 if (g
->fiber_pool_size
<= 0) {
542 g
->fiber_pool_size
= 1;
// Raise max_stacks to at least P.
545 if ((int)g
->max_stacks
< g
->P
)
546 g
->max_stacks
= g
->P
;
548 g
->global_fiber_pool_size
= g
->P
* (g
->fiber_pool_size
+1);
551 // Number of bytes/address - validation for debugger integration
552 g
->addr_size
= sizeof(void *);
554 __cilkrts_init_stats(&g
->stats
);
556 __cilkrts_frame_malloc_global_init(g
);
// Recomputed here because P or max_user_workers may have been changed since
// the user-settable values were first initialized.
559 g
->total_workers
= cilkg_calc_total_workers();
560 g
->system_workers
= g
->P
- 1; // system_workers is here for the debugger.
562 g
->workers_running
= 0;
563 g
->ltqsize
= 1024; /* FIXME */
565 g
->stack_size
= cilkos_validate_stack_size(g
->stack_size
);
// NOTE(review): the remainder of the function, including its return statement,
// is not visible in this view.
566 g
->failure_to_allocate_stack
= 0;
// Publishes 'g': stores it in the debugger-visible __cilkrts_global_state and
// then in cilkg_singleton_ptr, the pointer that cilkg_is_published() tests.
572 void cilkg_publish_global_state(global_state_t
* g
)
575 // TBD: which one of these needs to be executed first? I say
576 // cilkg_singleton_ptr needs to be set last, with a mfence in
577 // between, since it is the flag that cilkg_is_published() is
// checking for.
// NOTE(review): no fence is visible between the two stores in this view -
// confirm whether one is expected here.
579 __cilkrts_global_state
= g
;
581 cilkg_singleton_ptr
= g
;
584 void cilkg_deinit_global_state()
586 cilkg_singleton_ptr
= NULL
;
587 __cilkrts_global_state
= NULL
;
590 int cilkg_is_published(void)
592 return NULL
!= cilkg_singleton_ptr
;
595 int cilkg_set_param(const char* param
, const char* value
)
597 return set_param_imp(cilkg_get_user_settable_values(), param
, value
);
601 int cilkg_set_param_w(const wchar_t* param
, const wchar_t* value
)
603 return set_param_imp(cilkg_get_user_settable_values(), param
, value
);
608 // C++ scheduler function (that may throw exceptions)
609 typedef void cpp_scheduler_t(__cilkrts_worker
*w
);
// Looks up the scheduler stored in the global state and treats it as a
// cpp_scheduler_t for worker 'w'.
// NOTE(review): the call through 'scheduler' and the exception handling around
// it are not visible in this view; the message below suggests __cilkrts_bug is
// reached when an exception escapes - confirm.
612 void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker
*w
)
614 global_state_t
* g
= cilkg_get_global_state();
// A scheduler must have been installed in the global state.
615 CILK_ASSERT(g
->scheduler
);
// g->scheduler is stored with a different type; cast it to the C++ signature.
617 cpp_scheduler_t
* scheduler
= (cpp_scheduler_t
*) g
->scheduler
;
622 __cilkrts_bug("Exception escaped Cilk context");
626 __CILKRTS_END_EXTERN_C
628 /* End global_state.cpp */