[gcc/testsuite]
[official-gcc.git] / libcilkrts / runtime / global_state.cpp
blob6c77b5f766af0b45ed67dac628fc74504613a91e
1 /* global_state.cpp -*-C++-*-
3 *************************************************************************
5 * Copyright (C) 2009-2016, Intel Corporation
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
32 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
35 * *********************************************************************
37 * PLEASE NOTE: This file is a downstream copy of a file maintained in
38 * a repository at cilkplus.org. Changes made to this file that are not
39 * submitted through the contribution process detailed at
40 * http://www.cilkplus.org/submit-cilk-contribution will be lost the next
41 * time that a new version is released. Changes only submitted to the
42 * GNU compiler collection or posted to the git repository at
43 * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
44 * not tracked.
46 * We welcome your contributions to this open source project. Thank you
47 * for your assistance in helping us improve Cilk Plus.
48 **************************************************************************/
50 #include "global_state.h"
51 #include "os.h"
52 #include "bug.h"
53 #include "metacall_impl.h"
54 #include "stats.h"
55 #include "cilk/cilk_api.h"
56 #include "cilk_malloc.h"
57 #include "record-replay.h"
59 #include <algorithm> // For max()
60 #include <cstring>
61 #include <cstdlib>
62 #include <climits>
63 #include <cerrno>
65 #ifdef _WIN32
66 # include <wchar.h>
67 #endif
// Nonzero once the user settable values portion of the global state
// singleton is initialized, even if the rest of the singleton is not
// initialized.
//
// TBD: There is a race when multiple threads try to initialize the
// user settable values at the same time; nothing here guards this flag.
int cilkg_user_settable_values_initialized = false;
77 namespace {
79 // Single copy of the global state. Zero-filled until
80 // cilkg_get_user_settable_values() is called and partially-zero-filled until
81 // cilkg_init_global_state() is called. The first field is filled in with
82 // the size of a void* for the debugger and must be valid before initialization
83 static global_state_t global_state_singleton =
85 sizeof(void *), // addr_size
86 GLOBAL_STATE_VERSION, // structure version
90 // Variables that need to export C-style names
91 extern "C"
93 // Pointer to the global state singleton.
94 global_state_t *cilkg_singleton_ptr = NULL;
96 // __cilkrts_global_state is exported and referenced by the debugger.
97 // The debugger expects it to be valid when the module loads.
98 // CILK_EXPORT_DATA
99 global_state_t *__cilkrts_global_state = &global_state_singleton;
// Returns true if 'a' and 'b' are equal null-terminated strings.  The
// comparison is an exact, case-sensitive byte comparison.  Neither pointer
// may be null.
inline bool strmatch(const char* a, const char* b)
{
    return 0 == std::strcmp(a, b);
}
// Returns the integer value represented by the null-terminated, decimal string
// at 's'.
//
// 'errno' is cleared before the conversion and describes its outcome on
// return: 0 on success, ERANGE (set by strtol) if the value does not fit in a
// long, and EINVAL if 's' does not start with a number at all, in which case
// 0 is returned.  Characters following a valid number are ignored.
inline long to_long(const char* s)
{
    char *end = 0;

    errno = 0;
    const long result = std::strtol(s, &end, 10);

    // strtol() is not required to set errno when it converts nothing (for
    // example "" or "abc"); it simply returns 0.  Flag that case explicitly
    // so that callers can tell "no number" apart from a literal "0".
    if (end == s) {
        errno = EINVAL;
        return 0;
    }

    return result;
}
#ifdef _WIN32
// Returns true if 'a' and 'b' are equal null-terminated wide-char strings
inline bool strmatch(const wchar_t* a, const wchar_t* b)
{
    return 0 == wcscmp(a, b);
}

// Returns true if the multi-byte character string at 'a' represents the same
// character sequence as the wide-character string at 'b'.  The behavior is
// undefined if 'a' contains more than 30 multi-byte characters.
bool strmatch(const char* a, const wchar_t* b)
{
    // Convert 'a' to wide-characters, then compare.
    wchar_t wa[31];
    std::size_t count;
    errno_t err = mbstowcs_s(&count, wa, a, 30);
    CILK_ASSERT(0 == err);
    // Also treat a failed conversion as "no match" in case the assertion
    // above does not stop execution.
    if (err) return false;
    return strmatch(wa, b);
}

// Returns true if the wide-character string at 'a' represents the same
// character sequence as the multi-byte character string at 'b'.  The behavior
// is undefined if 'b' contains more than 30 multi-byte characters.
inline
bool strmatch(const wchar_t* a, const char* b)
{
    return strmatch(b, a);
}

// Returns the integer value represented by the null-terminated, decimal
// wide-char string at 's'.
//
// Mirrors the narrow-character to_long(): the string is parsed in base 10
// (not auto-detected, so "010" is ten rather than eight), 'errno' is cleared
// before the conversion, and 'errno' is set to EINVAL with a return value of
// 0 if 's' does not start with a number at all.
inline long to_long(const wchar_t* s)
{
    wchar_t *end = 0;

    errno = 0;
    const long result = wcstol(s, &end, 10);

    // Nothing was converted: report it instead of silently returning 0.
    if (end == s) {
        errno = EINVAL;
        return 0;
    }

    return result;
}
#endif
161 // Check if Cilkscreen or other sequential ptool wants to force reducers.
162 bool always_force_reduce()
164 // Metacall *looks* like a no-op. volatile needed to keep compiler from
165 // optimizing away variable.
166 volatile char not_force_reduce = '\377';
167 __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ZERO_IF_FORCE_REDUCE,
168 const_cast<char*>(&not_force_reduce));
169 return ! not_force_reduce;
172 // Stores the boolean value represented by the null-terminated string at 'val'
173 // into the integer object at 'out'. Returns '__CILKRTS_SET_PARAM_SUCCESS' if
174 // 'val' is "true", "false", "0" or "1" and '__CILKRTS_SET_PARAM_INVALID'
175 // otherwise.
176 template <typename INT_T, typename CHAR_T>
177 int store_bool(INT_T *out, const CHAR_T *val)
179 static const char* const s_zero = "0";
180 static const char* const s_one = "1";
181 static const char* const s_true = "true";
182 static const char* const s_false = "false";
184 if (val == 0)
185 return __CILKRTS_SET_PARAM_INVALID;
187 if (strmatch(s_false, val) || strmatch(s_zero, val)) {
188 *out = 0;
189 return __CILKRTS_SET_PARAM_SUCCESS;
192 if (strmatch(s_true, val) || strmatch(s_one, val)) {
193 *out = 1;
194 return __CILKRTS_SET_PARAM_SUCCESS;
197 return __CILKRTS_SET_PARAM_INVALID;
200 // Stores the integer value represented by the null-terminated string at 'val'
201 // into the integer object at 'out', restricting the result to the range 'min'
202 // to 'max', inclusive. Returns '__CILKRTS_SET_PARAM_SUCCESS' if the conversion
203 // succeeds and is in range, '__CILKRTS_SET_PARAM_XRANGE' if the conversion
204 // succeeds but is out of range, and '__CILKRTS_SET_PARAM_INVALID' otherwise. In
205 // the case of any error, '*out' is unchanged.
206 template <typename INT_T, typename CHAR_T>
207 int store_int(INT_T *out, const CHAR_T *val, INT_T min, INT_T max)
209 errno = 0;
210 long val_as_long = to_long(val);
211 if (val_as_long == 0 && errno != 0)
212 return __CILKRTS_SET_PARAM_INVALID;
213 if (val_as_long < min || val_as_long == LONG_MIN)
214 return __CILKRTS_SET_PARAM_XRANGE;
215 else if (val_as_long > max || val_as_long == LONG_MAX)
216 return __CILKRTS_SET_PARAM_XRANGE;
218 *out = val_as_long;
219 return __CILKRTS_SET_PARAM_SUCCESS;
222 // Implementaton of cilkg_set_param templatized on character type.
223 // Windows will instantiate with both char and wchar_t.
224 // Note that g must have its user settable values set, but need not be fully
225 // initialized.
226 template <class CHAR_T>
227 int set_param_imp(global_state_t* g, const CHAR_T* param, const CHAR_T* value)
229 static const char* const s_force_reduce = "force reduce";
230 static const char* const s_nworkers = "nworkers";
231 static const char* const s_max_user_workers = "max user workers";
232 static const char* const s_local_stacks = "local stacks";
233 static const char* const s_shared_stacks = "shared stacks";
234 static const char* const s_nstacks = "nstacks";
235 static const char* const s_stack_size = "stack size";
237 // We must have a parameter and a value
238 if (0 == param)
239 return __CILKRTS_SET_PARAM_INVALID;
240 if (0 == value)
241 return __CILKRTS_SET_PARAM_INVALID;
243 if (strmatch(param, s_force_reduce))
245 // Sets whether we force a reduce operation at every sync. Useful for
246 // debugging reducers. Off by default. Overridden by Cilkscreen
248 // Documented in cilk_api_<os>.h
249 if (always_force_reduce())
250 // Force reduce is set by cilkscreen. User cannot change it.
251 return __CILKRTS_SET_PARAM_LATE;
253 return store_bool(&g->force_reduce, value);
255 else if (strmatch(param, s_nworkers))
257 // Set the total number of workers. Overrides count of cores we get
258 // from the OS and the setting of the CILK_NWORKERS environment
259 // variable. Setting to 0 indicates that the default worker count
260 // should be used.
262 // Documented in cilk_api_<os>.h
263 if (cilkg_singleton_ptr)
264 return __CILKRTS_SET_PARAM_LATE;
266 // Fetch the number of cores. There must be at last 1, since we're
267 // executing on *something*, aren't we!?
268 int hardware_cpu_count = __cilkrts_hardware_cpu_count();
269 CILK_ASSERT(hardware_cpu_count > 0);
271 int max_cpu_count = 16 * hardware_cpu_count;
272 if (__cilkrts_running_under_sequential_ptool())
274 hardware_cpu_count = 1;
275 max_cpu_count = 1;
277 // Allow a value of 0, which means "set to hardware thread count".
278 int ret = store_int(&g->P, value, 0, max_cpu_count);
279 if (0 == g->P)
280 g->P = hardware_cpu_count;
281 return ret;
283 else if (strmatch(param, s_max_user_workers))
285 // ** UNDOCUMENTED **
287 // Sets the number of slots allocated for user worker threads
288 int hardware_cpu_count = __cilkrts_hardware_cpu_count();
289 CILK_ASSERT (hardware_cpu_count > 0);
291 return store_int(&g->max_user_workers, value, 1,
292 16 * hardware_cpu_count);
294 else if (strmatch(param, s_local_stacks))
296 // ** UNDOCUMENTED **
298 // Number of stacks we'll hold in the per-worker stack cache. Maximum
299 // value is 42. See __cilkrts_make_global_state for details.
300 return store_int(&g->fiber_pool_size, value, 0, 42);
302 else if (strmatch(param, s_shared_stacks))
304 // ** UNDOCUMENTED **
306 // Maximum number of stacks we'll hold in the global stack
307 // cache. Maximum value is 42. See __cilkrts_make_global_state for
308 // details.
309 return store_int(&g->global_fiber_pool_size, value, 0, 42);
311 else if (strmatch(param, s_nstacks))
313 // Sets the maximum number of stacks permitted at one time. If the
314 // runtime reaches this maximum, it will cease to allocate stacks and
315 // the app will lose parallelism. 0 means unlimited. Default is
316 // unlimited. Minimum is twice the number of worker threads, though
317 // that cannot be tested at this time.
319 // Undocumented at this time, though there are plans to expose it.
320 // The current implentation is for Linux debugging only and is not
321 // robust enough for users.
322 if (cilkg_singleton_ptr)
323 return __CILKRTS_SET_PARAM_LATE;
324 return store_int<unsigned>(&g->max_stacks, value, 0, INT_MAX);
326 else if (strmatch(param, s_stack_size))
328 // ** UNDOCUMENTED **
330 // Sets the size (in bytes) of the stacks that Cilk creates.
331 // Can only be set before the runtime starts.
332 if (cilkg_singleton_ptr)
333 return __CILKRTS_SET_PARAM_LATE;
335 // Maximum value that can be parsed is MAX_INT (32-bit).
336 int ret = store_int<size_t>(&g->stack_size, value, 0, INT_MAX);
338 // Process the value the user set (or 0 if the user didn't set
339 // anything) into something nice for the current OS. This
340 // processing is done immediately and stored into
341 // g->stack_size so that a call to get stack size will return
342 // the value that the runtime will actually use.
343 g->stack_size = cilkos_validate_stack_size(g->stack_size);
344 return ret;
348 // If got here, then didn't match any of the strings
349 return __CILKRTS_SET_PARAM_UNIMP;
352 inline
353 int calc_max_user_workers(global_state_t *g)
355 // If it's been set by the user, give back what we got
356 if (g->max_user_workers > 0)
357 return g->max_user_workers;
359 // Calculate it
360 return std::max(3, g->P * 2);
363 } // end unnamed namespace
365 __CILKRTS_BEGIN_EXTERN_C
368 * @brief Returns the global state object. If called for the first time,
369 * initializes the user-settable values in the global state, but does not
370 * initialize the rest of the structure.
372 global_state_t* cilkg_get_user_settable_values()
374 // Environment variable value. More than big enough for a 64-bit signed
375 // integer.
376 char envstr[24];
378 // Abbreviating &global_state_singleton as g is not only shorter, it also
379 // facilitates grepping for the string "g->", which appears ubiquitously
380 // in the runtime code.
381 global_state_t* g = &global_state_singleton;
383 // TBD: We need synchronization around this loop to prevent
384 // multiple threads from initializing this data.
385 if (! cilkg_user_settable_values_initialized)
387 size_t len;
389 // Preserve stealing disabled since it may have been set by the
390 // debugger
391 int stealing_disabled = g->stealing_disabled;
393 // All fields will be zero until set. In particular
394 std::memset(g, 0, sizeof(global_state_t));
396 // Fetch the number of cores. There must be at last 1, since we're
397 // executing on *something*, aren't we!?
398 int hardware_cpu_count = __cilkrts_hardware_cpu_count();
399 CILK_ASSERT(hardware_cpu_count > 0);
401 bool under_ptool = __cilkrts_running_under_sequential_ptool();
402 if (under_ptool)
403 hardware_cpu_count = 1;
405 g->stealing_disabled = stealing_disabled;
406 g->under_ptool = under_ptool;
407 g->force_reduce = 0; // Default Off
408 g->P = hardware_cpu_count; // Defaults to hardware CPU count
409 g->max_user_workers = 0; // 0 unless set by user
410 g->fiber_pool_size = 7; // Arbitrary default
412 g->global_fiber_pool_size = 3 * 3* g->P; // Arbitrary default
413 // 3*P was the default size of the worker array (including
414 // space for extra user workers). This parameter was chosen
415 // to match previous versions of the runtime.
417 if (4 == sizeof(void *))
418 g->max_stacks = 1200; // Only 1GB on 32-bit machines
419 else
420 g->max_stacks = 2400; // 2GB on 64-bit machines
422 // If we have 2400 1MB stacks, that is 2 gb. If we reach this
423 // limit on a single-socket machine, we may have other
424 // problems. Is 2400 too small for large multicore machines?
426 // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a
427 // value independent of P. When running on a Xeon Phi with
428 // small values of P, I recall seeing a few microbenchmarks
429 // (e.g., fib) where a limit of 10*P seemed to be
430 // unnecessarily slowing things down.
432 // That being said, the code has changed sufficiently that
433 // this observation may no longer be true.
435 // Note: in general, the worst-case number of stacks required
436 // for a Cilk computation with spawn depth "d" on P workers is
437 // O(Pd). Code with unbalanced recursion may run into issues
438 // with this stack usage.
440 g->max_steal_failures = 128; // TBD: depend on max_workers?
441 g->stack_size = 0; // 0 unless set by the user
443 // Assume no record or replay log for now
444 g->record_replay_file_name = NULL;
445 g->record_or_replay = RECORD_REPLAY_NONE; // set by user
447 if (always_force_reduce())
448 g->force_reduce = true;
449 else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE"))
450 store_bool(&g->force_reduce, envstr);
452 if (under_ptool)
453 g->P = 1; // Ignore environment variable if under cilkscreen
454 else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS"))
455 // Set P to environment variable, but limit to no less than 1
456 // and no more than 16 times the number of hardware threads.
457 store_int(&g->P, envstr, 1, 16 * hardware_cpu_count);
459 if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS"))
460 // Set max_user_workers to environment variable, but limit to no
461 // less than 1 and no more 16 times the number of hardware
462 // threads. If not specified, defaults (somewhat arbitrarily) to
463 // the larger of 3 and twice the number of hardware threads.
464 store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count);
466 if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES"))
467 // Set the number of times a worker should fail to steal before
468 // it looks to see whether it should suspend itself.
469 store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX);
471 // Compute the total number of workers to allocate. Subtract one from
472 // nworkers and user workers so that the first user worker isn't
473 // factored in twice.
475 // total_workers must be computed now to support __cilkrts_get_total_workers
476 g->total_workers = g->P + calc_max_user_workers(g) - 1;
478 #ifdef CILK_RECORD_REPLAY
479 // RecordReplay: See if we've been asked to replay a log
480 len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG");
481 if (len > 0)
483 len += 1; // Allow for trailing NUL
484 g->record_or_replay = REPLAY_LOG;
485 g->record_replay_file_name = (char *)__cilkrts_malloc(len);
486 cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG");
489 // RecordReplay: See if we've been asked to record a log
490 len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG");
491 if (len > 0)
493 if (RECORD_REPLAY_NONE != g->record_or_replay)
494 cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n");
495 else
497 len += 1; // Allow for trailing NUL
498 g->record_or_replay = RECORD_LOG;
499 g->record_replay_file_name = (char *)__cilkrts_malloc(len);
500 cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG");
503 #endif
505 cilkg_user_settable_values_initialized = true;
508 return g;
511 int cilkg_calc_total_workers()
513 global_state_t* g = cilkg_get_user_settable_values();
515 // Compute the total number of workers to allocate. Subtract one from
516 // nworkers and user workers so that the first user worker isn't
517 // factored in twice.
518 return g->P + calc_max_user_workers(g) - 1;
521 // Should be called while holding the global lock.
522 global_state_t* cilkg_init_global_state()
524 if (cilkg_singleton_ptr)
525 return cilkg_singleton_ptr;
527 // Get partially-initialized global state.
528 global_state_t* g = cilkg_get_user_settable_values();
530 if (g->max_stacks > 0) {
532 // nstacks is currently honored on non-Windows systems only.
534 // Set an upper bound on the number of stacks that are allocated. If
535 // nstacks is set, each worker gets up to one stack in its cache so that
536 // no one worker can hog all of the free stacks and keep work from being
537 // stolen by the other workers.
539 // nstacks corresponds to the number of stacks that will be allocated by
540 // the runtime apart from the initial stack created for each thread by
541 // the system. Therefore, if a user asks for n stacks, and there are
542 // p workers created, the total number of stacks is actually n + p.
544 // This feature is primarily for MIC which has flat memory
545 // instead of virtual addresses and tends to run out really quickly.
546 // It is not implemented for Windows and it's non-intuitive
547 // interaction with the local stack cache is specifically to help out
548 // MIC.
550 // About max_stacks / P stacks, except we require at least 1
551 // per pool.
552 if (((int)g->max_stacks / g->P) < g->fiber_pool_size)
553 g->fiber_pool_size = g->max_stacks / g->P;
555 if (g->fiber_pool_size <= 0) {
556 g->fiber_pool_size = 1;
559 if ((int)g->max_stacks < g->P)
560 g->max_stacks = g->P;
562 g->global_fiber_pool_size = g->P * (g->fiber_pool_size+1);
565 // Number of bytes/address - validation for debugger integration
566 g->addr_size = sizeof(void *);
568 __cilkrts_init_stats(&g->stats);
570 __cilkrts_frame_malloc_global_init(g);
572 g->Q = 0;
573 g->total_workers = cilkg_calc_total_workers();
574 g->system_workers = g->P - 1; // system_workers is here for the debugger.
575 g->work_done = 0;
576 g->workers_running = 0;
577 g->ltqsize = 1024; /* FIXME */
579 g->stack_size = cilkos_validate_stack_size(g->stack_size);
580 g->failure_to_allocate_stack = 0;
582 return g;
585 void cilkg_publish_global_state(global_state_t* g)
587 // TBD: which one of these needs to be executed first? I say
588 // cilkg_singleton_ptr needs to be set last, with a mfence in
589 // between, since it is the flag that cilkg_is_published_is
590 // checking for.
591 __cilkrts_global_state = g;
592 __cilkrts_fence();
593 cilkg_singleton_ptr = g;
596 void cilkg_deinit_global_state()
598 cilkg_singleton_ptr = NULL;
600 // The pointer to the global state needs to remain valid for the
601 // debugger. Thus, we can't clear the following pointer.
602 // __cilkrts_global_state = NULL;
605 // We also don't reset the global state, so that if we resume
606 // execution after ending Cilk, user set variables (e.g., # of
607 // workers) remains valid.
610 int cilkg_is_published(void)
612 return NULL != cilkg_singleton_ptr;
615 int cilkg_set_param(const char* param, const char* value)
617 return set_param_imp(cilkg_get_user_settable_values(), param, value);
#ifdef _WIN32
// Wide-character counterpart of cilkg_set_param(); built on Windows only.
// Returns the status code produced by set_param_imp().
int cilkg_set_param_w(const wchar_t* param, const wchar_t* value)
{
    return set_param_imp(cilkg_get_user_settable_values(), param, value);
}
#endif
627 extern "C++" {
628 // C++ scheduler function (that may throw exceptions)
629 typedef void cpp_scheduler_t(__cilkrts_worker *w);
632 void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w)
634 global_state_t* g = cilkg_get_global_state();
635 CILK_ASSERT(g->scheduler);
637 cpp_scheduler_t* scheduler = (cpp_scheduler_t*) g->scheduler;
639 try {
640 scheduler(w);
641 } catch (...) {
642 __cilkrts_bug("Exception escaped Cilk context");
646 __CILKRTS_END_EXTERN_C
648 /* End global_state.cpp */