PR rtl-optimization/84308
[official-gcc.git] / liboffloadmic / runtime / offload_engine.cpp
blobb2de56c68eb6effa15cfc073f7b6c48a949945c0
1 /*
2 Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #include "offload_engine.h"
32 #include <signal.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
37 #include <algorithm>
38 #include <vector>
40 #include "offload_host.h"
41 #include "offload_table.h"
42 #include "offload_iterator.h"
// Delimiter between the directories of a library search path list:
// ";" when HOST_WINNT is defined, ":" otherwise.
#ifdef HOST_WINNT
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif
50 // Static members of Stream class must be described somewhere.
51 // This members describe the list of all streams defined in programm
52 // via call to _Offload_stream_create.
53 uint64_t Stream::m_streams_count = 0;
54 StreamMap Stream::all_streams;
55 mutex_t Stream::m_stream_lock;
56 char* mic_library_path = 0;
58 const char* Engine::m_func_names[Engine::c_funcs_total] =
60 "server_compute",
61 #ifdef MYO_SUPPORT
62 "server_myoinit",
63 "server_myofini",
64 #endif // MYO_SUPPORT
65 "server_init",
66 "server_var_table_size",
67 "server_var_table_copy",
68 "server_set_stream_affinity"
71 // Symbolic representation of system signals. Fix for CQ233593
72 const char* Engine::c_signal_names[Engine::c_signal_max] =
74 "Unknown SIGNAL",
75 "SIGHUP", /* 1, Hangup (POSIX). */
76 "SIGINT", /* 2, Interrupt (ANSI). */
77 "SIGQUIT", /* 3, Quit (POSIX). */
78 "SIGILL", /* 4, Illegal instruction (ANSI). */
79 "SIGTRAP", /* 5, Trace trap (POSIX). */
80 "SIGABRT", /* 6, Abort (ANSI). */
81 "SIGBUS", /* 7, BUS error (4.2 BSD). */
82 "SIGFPE", /* 8, Floating-point exception (ANSI). */
83 "SIGKILL", /* 9, Kill, unblockable (POSIX). */
84 "SIGUSR1", /* 10, User-defined signal 1 (POSIX). */
85 "SIGSEGV", /* 11, Segmentation violation (ANSI). */
86 "SIGUSR2", /* 12, User-defined signal 2 (POSIX). */
87 "SIGPIPE", /* 13, Broken pipe (POSIX). */
88 "SIGALRM", /* 14, Alarm clock (POSIX). */
89 "SIGTERM", /* 15, Termination (ANSI). */
90 "SIGSTKFLT", /* 16, Stack fault. */
91 "SIGCHLD", /* 17, Child status has changed (POSIX). */
92 "SIGCONT", /* 18, Continue (POSIX). */
93 "SIGSTOP", /* 19, Stop, unblockable (POSIX). */
94 "SIGTSTP", /* 20, Keyboard stop (POSIX). */
95 "SIGTTIN", /* 21, Background read from tty (POSIX). */
96 "SIGTTOU", /* 22, Background write to tty (POSIX). */
97 "SIGURG", /* 23, Urgent condition on socket (4.2 BSD). */
98 "SIGXCPU", /* 24, CPU limit exceeded (4.2 BSD). */
99 "SIGXFSZ", /* 25, File size limit exceeded (4.2 BSD). */
100 "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD). */
101 "SIGPROF", /* 27, Profiling alarm clock (4.2 BSD). */
102 "SIGWINCH", /* 28, Window size change (4.3 BSD, Sun). */
103 "SIGIO", /* 29, I/O now possible (4.2 BSD). */
104 "SIGPWR", /* 30, Power failure restart (System V). */
105 "SIGSYS" /* 31, Bad system call. */
108 void Engine::init(void)
110 if (!m_ready) {
111 mutex_locker_t locker(m_lock);
113 if (!m_ready) {
114 // start process if not done yet
115 if (m_process == 0) {
116 init_process();
119 // load penging images
120 load_libraries();
122 // and (re)build pointer table
123 init_ptr_data();
125 // it is ready now
126 m_ready = true;
128 // Inform the debugger
129 if (__dbg_is_attached) {
130 __dbg_target_so_loaded();
136 void Engine::print_stream_cpu_list(const char * str)
138 int count = 0;
139 char buffer[1024];
140 CpuEl* cpu_el = m_cpu_head;
142 OFFLOAD_DEBUG_TRACE(3,
143 "%s : cpu list as Index(Count) for the streams is :\n", str);
144 buffer[0] = 0;
145 for (int i = 0; i < m_num_threads; i++) {
146 cpu_el = m_cpus + i;
147 if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
148 count++;
149 sprintf(buffer + strlen(buffer), "%d(%d) ", CPU_INDEX(cpu_el), cpu_el->count);
150 if (count % 20 == 0) {
151 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
152 buffer[0] = 0;
156 if (count % 20 != 0) {
157 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
161 void Engine::init_process(void)
163 COIENGINE engine;
164 COIRESULT res;
165 const char **environ;
166 char buf[4096]; // For exe path name
167 char* mic_device_main = 0;
169 // create environment for the target process
170 environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
171 if (environ != 0) {
172 for (const char **p = environ; *p != 0; p++) {
173 OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
177 // Create execution context in the specified device
178 OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
179 m_physical_index);
180 res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine);
181 check_result(res, c_get_engine_handle, m_index, res);
183 // Get engine info on threads and cores.
184 // The values of core number and thread number will be used later at stream
185 // creation by call to _Offload_stream_create(device,number_of_cpus).
187 COI_ENGINE_INFO engine_info;
189 res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info);
190 check_result(res, c_get_engine_info, m_index, res);
191 if (mic_library_path == 0 ) {
192 if (engine_info.ISA == COI_DEVICE_KNC) {
193 mic_library_path = knc_library_path;
195 else if (engine_info.ISA == COI_DEVICE_KNL) {
196 mic_library_path = knl_library_path;
198 else {
199 LIBOFFLOAD_ERROR(c_unknown_mic_device_type);
203 // m_cpus is the list of all available threads.
204 // At the begining all threads made available through OFFLOAD_DEVICES
205 // or all threads existed at the engine if OFFLOAD_DEVICES isn't set.
206 // m_cpu_head points to the head of the m_cpus list.
207 // m_cpus is ordered by number of streams using the thread.
208 // m_cpu_head points to the least used thread.
209 // After creating and destroying a stream the m_cpus list must be fixed
210 // to be ordered.
212 m_cpus = (CpuEl*)malloc(engine_info.NumThreads * sizeof(CpuEl));
213 if (m_cpus == NULL)
214 LIBOFFLOAD_ERROR(c_malloc);
215 memset(m_cpus, 0, engine_info.NumThreads * sizeof(CpuEl));
216 CpuEl* prev_cpu = NULL;
218 for (int i = 0; i < engine_info.NumThreads; i++) {
219 if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
220 if (prev_cpu) {
221 prev_cpu->next = m_cpus + i;
223 else {
224 m_cpu_head = m_cpus + i;
226 m_cpus[i].prev = prev_cpu;
227 m_cpus[i].count = 0;
228 prev_cpu = m_cpus + i;
232 // The following values will be used at pipeline creation for streams
233 m_num_cores = engine_info.NumCores;
234 m_num_threads = engine_info.NumThreads;
236 print_stream_cpu_list("init_process");
238 // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
239 // Only the value 2 is supported in 16.0
240 if (mic_dma_channel_count == 2) {
241 if (COI::ProcessConfigureDMA) {
242 // Set DMA channels using COI API
243 COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE);
245 else {
246 // Set environment variable COI_DMA_CHANNEL_COUNT
247 // use putenv instead of setenv as Windows has no setenv.
248 // Note: putenv requires its argument can't be freed or modified.
249 // So no free after call to putenv or elsewhere.
250 char * env_var = strdup("COI_DMA_CHANNEL_COUNT=2");
251 if (env_var == NULL)
252 LIBOFFLOAD_ERROR(c_malloc);
253 putenv(env_var);
257 // Target executable is not available then use compiler provided offload_main
258 if (__target_exe == 0) {
259 // find target executable to be used if main application is not an
260 // offload build application.
261 const char *base_name = "offload_main";
262 if (mic_library_path != 0) {
263 char *buf = strdup(mic_library_path);
264 if (buf == NULL)
265 LIBOFFLOAD_ERROR(c_malloc);
266 char *try_name = (char*) alloca(strlen(mic_library_path) +
267 strlen(base_name) + 2);
268 char *dir, *ptr;
270 for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
271 dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
272 // compose a full path
273 sprintf(try_name, "%s/%s", dir, base_name);
275 // check if such file exists
276 struct stat st;
277 if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
278 mic_device_main = strdup(try_name);
279 if (mic_device_main == NULL)
280 LIBOFFLOAD_ERROR(c_malloc);
281 break;
284 free(buf);
286 if (mic_device_main == 0) {
287 LIBOFFLOAD_ERROR(c_report_no_target_exe, "offload_main");
288 exit(1);
291 OFFLOAD_DEBUG_TRACE(2,
292 "Loading target executable %s\n",mic_device_main);
294 res = COI::ProcessCreateFromFile(
295 engine, // in_Engine
296 mic_device_main, // in_pBinaryName
297 0, // in_Argc
298 0, // in_ppArgv
299 environ == 0, // in_DupEnv
300 environ, // in_ppAdditionalEnv
301 mic_proxy_io, // in_ProxyActive
302 mic_proxy_fs_root, // in_ProxyfsRoot
303 mic_buffer_size, // in_BufferSpace
304 mic_library_path, // in_LibrarySearchPath
305 &m_process // out_pProcess
308 else {
309 // Target executable should be available by the time when we
310 // attempt to initialize the device
312 // Need the full path of the FAT exe for VTUNE
314 #ifndef TARGET_WINNT
315 ssize_t len = readlink("/proc/self/exe", buf,1000);
316 #else
317 int len = GetModuleFileName(NULL, buf,1000);
318 #endif // TARGET_WINNT
319 if (len == -1) {
320 LIBOFFLOAD_ERROR(c_report_no_host_exe);
321 exit(1);
323 else if (len > 999) {
324 LIBOFFLOAD_ERROR(c_report_path_buff_overflow);
325 exit(1);
327 buf[len] = '\0';
330 OFFLOAD_DEBUG_TRACE(2,
331 "Loading target executable \"%s\" from %p, size %lld, host file %s\n",
332 __target_exe->name, __target_exe->data, __target_exe->size,
333 buf);
335 res = COI::ProcessCreateFromMemory(
336 engine, // in_Engine
337 __target_exe->name, // in_pBinaryName
338 __target_exe->data, // in_pBinaryBuffer
339 __target_exe->size, // in_BinaryBufferLength,
340 0, // in_Argc
341 0, // in_ppArgv
342 environ == 0, // in_DupEnv
343 environ, // in_ppAdditionalEnv
344 mic_proxy_io, // in_ProxyActive
345 mic_proxy_fs_root, // in_ProxyfsRoot
346 mic_buffer_size, // in_BufferSpace
347 mic_library_path, // in_LibrarySearchPath
348 buf, // in_FileOfOrigin
349 -1, // in_FileOfOriginOffset use -1 to indicate to
350 // COI that is is a FAT binary
351 &m_process // out_pProcess
354 check_result(res, c_process_create, m_index, res);
356 if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) {
357 // available only in MPSS 4.2 and greater
358 if (COI::ProcessSetCacheSize != 0 ) {
359 int flags;
360 // Need compiler to use MPSS 3.2 or greater to get these
361 // definition so currently hardcoding it
362 // COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
363 flags = 0x00020002;
364 res = COI::ProcessSetCacheSize(
365 m_process, // in_Process
366 mic_2m_buffer_size, // in_HugePagePoolSize
367 flags, // inHugeFlags
368 mic_4k_buffer_size, // in_SmallPagePoolSize
369 flags, // inSmallFlags
370 0, // in_NumDependencies
371 0, // in_pDependencies
372 0 // out_PCompletion
374 OFFLOAD_DEBUG_TRACE(2,
375 "Reserve target buffers 4K pages = %d 2M pages = %d\n",
376 mic_4k_buffer_size, mic_2m_buffer_size);
377 check_result(res, c_process_set_cache_size, m_index, res);
379 else {
380 OFFLOAD_DEBUG_TRACE(2,
381 "Reserve target buffers not supported in current MPSS\n");
385 // get function handles
386 res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
387 m_func_names, m_funcs);
388 check_result(res, c_process_get_func_handles, m_index, res);
390 // initialize device side
391 pid_t pid = init_device();
393 // For IDB
394 if (__dbg_is_attached) {
395 // TODO: we have in-memory executable now.
396 // Check with IDB team what should we provide them now?
397 if (__target_exe == 0) {
398 strcpy(__dbg_target_exe_name, "offload_main");
400 else {
401 if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
402 strcpy(__dbg_target_exe_name, __target_exe->name);
405 __dbg_target_so_pid = pid;
406 __dbg_target_id = m_physical_index;
407 // The call to __dbg_target_so_loaded() is moved
408 // to Engine:init so all the libraries are loaded before
409 // informing debugger so debugger can access them.
410 // __dbg_target_so_loaded();
414 void Engine::fini_process(bool verbose)
416 if (m_process != 0) {
417 uint32_t sig;
418 int8_t ret;
420 // destroy target process
421 OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
422 m_index);
424 COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
425 m_process = 0;
427 if (res == COI_SUCCESS) {
428 OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
429 sig, ret);
430 if (verbose) {
431 if (sig != 0) {
432 LIBOFFLOAD_ERROR(
433 c_mic_process_exit_sig, m_index, sig,
434 c_signal_names[sig >= c_signal_max ? 0 : sig]);
436 else {
437 LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
441 // for idb
442 if (__dbg_is_attached) {
443 __dbg_target_so_unloaded();
446 else {
447 if (verbose) {
448 LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
454 void Engine::load_libraries()
456 // load libraries collected so far
457 for (TargetImageList::iterator it = m_images.begin();
458 it != m_images.end(); it++) {
459 OFFLOAD_DEBUG_TRACE(2,
460 "Loading library \"%s\" from %p, size %llu, host file %s\n",
461 it->name, it->data, it->size, it->origin);
463 // load library to the device
464 COILIBRARY lib;
465 COIRESULT res;
466 res = COI::ProcessLoadLibraryFromMemory(m_process,
467 it->data,
468 it->size,
469 it->name,
470 mic_library_path,
471 it->origin,
472 (it->origin) ? -1 : 0,
473 COI_LOADLIBRARY_V1_FLAGS,
474 &lib);
475 m_dyn_libs.push_front(DynLib(it->name, it->data, lib));
477 if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
478 check_result(res, c_load_library, it->origin, m_index, res);
481 m_images.clear();
484 void Engine::unload_library(const void *data, const char *name)
486 if (m_process == 0) {
487 return;
489 for (DynLibList::iterator it = m_dyn_libs.begin();
490 it != m_dyn_libs.end(); it++) {
491 if (it->data == data) {
492 COIRESULT res;
493 OFFLOAD_DEBUG_TRACE(2,
494 "Unloading library \"%s\"\n",name);
495 res = COI::ProcessUnloadLibrary(m_process,it->lib);
496 m_dyn_libs.erase(it);
497 if (res != COI_SUCCESS) {
498 check_result(res, c_unload_library, m_index, res);
500 return;
505 static bool target_entry_cmp(
506 const VarList::BufEntry &l,
507 const VarList::BufEntry &r
510 const char *l_name = reinterpret_cast<const char*>(l.name);
511 const char *r_name = reinterpret_cast<const char*>(r.name);
512 return strcmp(l_name, r_name) < 0;
515 static bool host_entry_cmp(
516 const VarTable::Entry *l,
517 const VarTable::Entry *r
520 return strcmp(l->name, r->name) < 0;
523 void Engine::init_ptr_data(void)
525 COIRESULT res;
526 COIEVENT event;
528 // Prepare table of host entries
529 std::vector<const VarTable::Entry*> host_table(
530 Iterator(__offload_vars.get_head()),
531 Iterator());
533 // no need to do anything further is host table is empty
534 if (host_table.size() <= 0) {
535 return;
538 // Get var table entries from the target.
539 // First we need to get size for the buffer to copy data
540 struct {
541 int64_t nelems;
542 int64_t length;
543 } params;
545 res = COI::PipelineRunFunction(get_pipeline(),
546 m_funcs[c_func_var_table_size],
547 0, 0, 0,
548 0, 0,
549 0, 0,
550 &params, sizeof(params),
551 &event);
552 check_result(res, c_pipeline_run_func, m_index, res);
554 res = COI::EventWait(1, &event, -1, 1, 0, 0);
555 check_result(res, c_event_wait, res);
557 if (params.length == 0) {
558 return;
561 // create buffer for target entries and copy data to host
562 COIBUFFER buffer;
563 res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
564 &m_process, &buffer);
565 check_result(res, c_buf_create, m_index, res);
567 COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
568 res = COI::PipelineRunFunction(get_pipeline(),
569 m_funcs[c_func_var_table_copy],
570 1, &buffer, &flags,
571 0, 0,
572 &params.nelems, sizeof(params.nelems),
573 0, 0,
574 &event);
575 check_result(res, c_pipeline_run_func, m_index, res);
577 res = COI::EventWait(1, &event, -1, 1, 0, 0);
578 check_result(res, c_event_wait, res);
580 // patch names in target data
581 VarList::BufEntry *target_table;
582 COIMAPINSTANCE map_inst;
583 res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
584 0, &map_inst,
585 reinterpret_cast<void**>(&target_table));
586 check_result(res, c_buf_map, res);
588 VarList::table_patch_names(target_table, params.nelems);
590 // and sort entries
591 std::sort(target_table, target_table + params.nelems, target_entry_cmp);
592 std::sort(host_table.begin(), host_table.end(), host_entry_cmp);
594 // merge host and target entries and enter matching vars map
595 std::vector<const VarTable::Entry*>::const_iterator hi =
596 host_table.begin();
597 std::vector<const VarTable::Entry*>::const_iterator he =
598 host_table.end();
599 const VarList::BufEntry *ti = target_table;
600 const VarList::BufEntry *te = target_table + params.nelems;
602 while (hi != he && ti != te) {
603 int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
604 if (res == 0) {
605 bool is_new;
606 // add matching entry to var map
607 PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new);
609 // store address for new entries
610 if (is_new) {
611 ptr->mic_addr = ti->addr;
612 ptr->is_static = true;
613 ptr->var_alloc_type = (*hi)->var_alloc_type;
615 ptr->alloc_ptr_data_lock.unlock();
616 hi++;
617 ti++;
619 else if (res < 0) {
620 hi++;
622 else {
623 ti++;
627 // cleanup
628 res = COI::BufferUnmap(map_inst, 0, 0, 0);
629 check_result(res, c_buf_unmap, res);
631 res = COI::BufferDestroy(buffer);
632 check_result(res, c_buf_destroy, res);
635 COIRESULT Engine::compute(
636 _Offload_stream stream,
637 const std::list<COIBUFFER> &buffers,
638 const void* data,
639 uint16_t data_size,
640 void* ret,
641 uint16_t ret_size,
642 uint32_t num_deps,
643 const COIEVENT* deps,
644 COIEVENT* event
645 ) /* const */
647 COIBUFFER *bufs;
648 COI_ACCESS_FLAGS *flags;
649 COIRESULT res;
651 // convert buffers list to array
652 int num_bufs = buffers.size();
653 if (num_bufs > 0) {
654 bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
655 flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
656 sizeof(COI_ACCESS_FLAGS));
658 int i = 0;
659 for (std::list<COIBUFFER>::const_iterator it = buffers.begin();
660 it != buffers.end(); it++) {
661 bufs[i] = *it;
663 // TODO: this should be fixed
664 flags[i++] = COI_SINK_WRITE;
667 else {
668 bufs = 0;
669 flags = 0;
671 COIPIPELINE pipeline = (stream == no_stream) ?
672 get_pipeline() :
673 get_pipeline(stream);
674 // start computation
675 res = COI::PipelineRunFunction(pipeline,
676 m_funcs[c_func_compute],
677 num_bufs, bufs, flags,
678 num_deps, deps,
679 data, data_size,
680 ret, ret_size,
681 event);
682 return res;
685 pid_t Engine::init_device(void)
687 struct init_data {
688 int device_index;
689 int devices_total;
690 int console_level;
691 int offload_report_level;
692 } data;
693 COIRESULT res;
694 COIEVENT event;
695 pid_t pid;
697 OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
698 "Initializing device with logical index %d "
699 "and physical index %d\n",
700 m_index, m_physical_index);
702 // setup misc data
703 data.device_index = m_index;
704 data.devices_total = mic_engines_total;
705 data.console_level = console_enabled;
706 data.offload_report_level = offload_report_level;
708 res = COI::PipelineRunFunction(get_pipeline(),
709 m_funcs[c_func_init],
710 0, 0, 0, 0, 0,
711 &data, sizeof(data),
712 &pid, sizeof(pid),
713 &event);
714 check_result(res, c_pipeline_run_func, m_index, res);
716 res = COI::EventWait(1, &event, -1, 1, 0, 0);
717 check_result(res, c_event_wait, res);
719 OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);
721 return pid;
724 // data associated with each thread
725 struct Thread {
726 Thread(long* addr_coipipe_counter) {
727 m_addr_coipipe_counter = addr_coipipe_counter;
728 memset(m_pipelines, 0, sizeof(m_pipelines));
731 ~Thread() {
732 #ifndef TARGET_WINNT
733 __sync_sub_and_fetch(m_addr_coipipe_counter, 1);
734 #else // TARGET_WINNT
735 _InterlockedDecrement(m_addr_coipipe_counter);
736 #endif // TARGET_WINNT
737 for (int i = 0; i < mic_engines_total; i++) {
738 if (m_pipelines[i] != 0) {
739 COI::PipelineDestroy(m_pipelines[i]);
744 COIPIPELINE get_pipeline(int index) const {
745 return m_pipelines[index];
748 void set_pipeline(int index, COIPIPELINE pipeline) {
749 m_pipelines[index] = pipeline;
752 AutoSet& get_auto_vars() {
753 return m_auto_vars;
756 private:
757 long* m_addr_coipipe_counter;
758 AutoSet m_auto_vars;
759 COIPIPELINE m_pipelines[MIC_ENGINES_MAX];
762 COIPIPELINE Engine::get_pipeline(void)
764 Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
765 if (thread == 0) {
766 thread = new Thread(&m_proc_number);
767 thread_setspecific(mic_thread_key, thread);
770 COIPIPELINE pipeline = thread->get_pipeline(m_index);
771 if (pipeline == 0) {
772 COIRESULT res;
773 int proc_num;
775 #ifndef TARGET_WINNT
776 proc_num = __sync_fetch_and_add(&m_proc_number, 1);
777 #else // TARGET_WINNT
778 proc_num = _InterlockedIncrement(&m_proc_number);
779 #endif // TARGET_WINNT
781 if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
782 LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
783 LIBOFFLOAD_ABORT;
786 // Create pipeline for this thread
787 if (m_assigned_cpus == 0) {
788 // If m_assigned_cpus is NULL, it implies all threads
789 // Create the pipeline with no CPU mask
790 res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
791 } else {
792 // Create COI CPU mask
793 COI_CPU_MASK in_Mask;
794 res = COI::PipelineClearCPUMask(in_Mask);
795 check_result(res, c_clear_cpu_mask, m_index, res);
797 int threads_per_core = m_num_threads / m_num_cores;
799 // Available threads are defined by examining of m_assigned_cpus bitset.
800 // We skip thread 0.
801 for (int i = 1; i < m_num_threads; i++) {
802 // For available thread i m_assigned_cpus[i] is equal to 1
803 if ((*m_assigned_cpus)[i]) {
804 COI_CPU_MASK_SET(i, in_Mask);
807 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this CPU thread\n"
808 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
809 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
810 in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
811 in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
812 in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
813 in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
815 // Create the pipeline with allowable CPUs
816 res = COI::PipelineCreate(m_process, in_Mask, mic_stack_size, &pipeline);
818 check_result(res, c_pipeline_create, m_index, res);
819 thread->set_pipeline(m_index, pipeline);
821 return pipeline;
824 Stream* Stream::find_stream(uint64_t handle, bool remove)
826 Stream *stream = 0;
828 m_stream_lock.lock();
830 StreamMap::iterator it = all_streams.find(handle);
831 if (it != all_streams.end()) {
832 stream = it->second;
833 if (remove) {
834 all_streams.erase(it);
838 m_stream_lock.unlock();
839 return stream;
842 void Engine::move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after)
844 if (cpu_what == cpu_after) {
845 return;
847 CpuEl* cpu_prev = cpu_what->prev;
849 // remove cpu_what
850 if (!cpu_prev) {
851 m_cpu_head = cpu_what->next;
853 else {
854 cpu_prev->next = cpu_what->next;
856 if (cpu_what->next) {
857 cpu_what->next->prev = cpu_prev;
860 // insert cpu_what after cpu_after
861 cpu_what->prev = cpu_after;
862 cpu_what->next = cpu_after->next;
863 if (cpu_after->next) {
864 cpu_after->next->prev = cpu_what;
866 cpu_after->next = cpu_what;
869 COIPIPELINE Engine::get_pipeline(_Offload_stream handle)
871 Stream * stream = Stream::find_stream(handle, false);
873 if (!stream) {
874 LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
875 LIBOFFLOAD_ABORT;
878 COIPIPELINE pipeline = stream->get_pipeline();
880 if (pipeline == 0) {
881 COIRESULT res;
882 int proc_num;
883 COI_CPU_MASK in_Mask ;
885 #ifndef TARGET_WINNT
886 proc_num = __sync_fetch_and_add(&m_proc_number, 1);
887 #else // TARGET_WINNT
888 proc_num = _InterlockedIncrement(&m_proc_number);
889 #endif // TARGET_WINNT
891 if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
892 LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
893 LIBOFFLOAD_ABORT;
896 m_stream_lock.lock();
898 // start process if not done yet
899 if (m_process == 0) {
900 init_process();
903 // create CPUmask
904 res = COI::PipelineClearCPUMask(in_Mask);
905 check_result(res, c_clear_cpu_mask, m_index, res);
907 int stream_cpu_num = stream->get_cpu_number();
909 stream->m_stream_cpus.reset();
911 int threads_per_core = m_num_threads / m_num_cores;
914 // Available threads is taken from m_cpus list.
915 // m_cpu_head points to the head of m_cpus.
916 // the elements of m_cpus is ordered by the number of usage in streams.
918 CpuEl *cpu_el = m_cpu_head;
919 CpuEl *cpu_used_el, *cpu_used_prev, *cpu_prev;
921 for (int i = 0; i < stream_cpu_num; i++) {
922 COI_CPU_MASK_SET(CPU_INDEX(cpu_el), in_Mask);
923 stream->m_stream_cpus.set(CPU_INDEX(cpu_el));
924 //If the number of availabale threads is less than stream_cpu_num,
925 // the stream_cpu_num is restricted to this number.
926 if (!cpu_el->next) {
927 break;
929 if (i + 1 < stream_cpu_num) {
930 cpu_el = cpu_el->next;
934 // assertion : cpu_el points to the last used thread
935 cpu_used_el = cpu_el;
936 while (cpu_used_el) {
937 cpu_used_el->count++;
938 cpu_el = cpu_prev = cpu_used_el;
939 cpu_used_prev = cpu_used_el->prev;
940 if (!cpu_el->next) {
941 cpu_used_el = cpu_used_prev;
942 continue;
945 while (cpu_el) {
946 if (cpu_used_el->count < cpu_el->count) {
947 break;
949 // Equal used threads are ordered by thread number to
950 // assign to a stream as contiguous threads as possible.
951 else if (cpu_used_el->count == cpu_el->count &&
952 CPU_INDEX(cpu_used_el) < CPU_INDEX(cpu_el)) {
953 break;
955 cpu_prev = cpu_el;
956 cpu_el = cpu_el->next;
958 if (cpu_used_el != cpu_prev) {
959 move_cpu_el_after(cpu_used_el, cpu_prev);
961 cpu_used_el = cpu_used_prev;
963 print_stream_cpu_list("get_pipeline");
965 // create pipeline for this thread
966 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this Stream\n"
967 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
968 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
969 in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
970 in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
971 in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
972 in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
973 res = COI::PipelineCreate(m_process, in_Mask,
974 mic_stack_size, &pipeline);
975 check_result(res, c_pipeline_create, m_index, res);
977 // Set stream's affinities
979 struct affinity_spec affinity_spec;
980 char* affinity_type;
981 int i;
983 // "compact" by default
984 affinity_spec.affinity_type = affinity_compact;
986 // Check if user has specified type of affinity
987 if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) !=
988 NULL)
990 char affinity_str[16];
991 int affinity_str_len;
993 OFFLOAD_DEBUG_TRACE(2,
994 "User has specified OFFLOAD_STREAM_AFFINITY=%s\n",
995 affinity_type);
997 // Set type of affinity requested
998 affinity_str_len = strlen(affinity_type);
999 for (i=0; i<affinity_str_len && i<15; i++)
1001 affinity_str[i] = tolower(affinity_type[i]);
1003 affinity_str[i] = '\0';
1004 if (strcmp(affinity_str, "compact") == 0) {
1005 affinity_spec.affinity_type = affinity_compact;
1006 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1007 } else if (strcmp(affinity_str, "scatter") == 0) {
1008 affinity_spec.affinity_type = affinity_scatter;
1009 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n");
1010 } else {
1011 LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str);
1012 affinity_spec.affinity_type = affinity_compact;
1013 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1016 // Make flat copy of sink mask because COI's mask is opaque
1017 for (i=0; i<16; i++) {
1018 affinity_spec.sink_mask[i] = in_Mask[i];
1020 // Set number of cores and threads
1021 affinity_spec.num_cores = m_num_cores;
1022 affinity_spec.num_threads = m_num_threads;
1024 COIEVENT event;
1025 res = COI::PipelineRunFunction(pipeline,
1026 m_funcs[c_func_set_stream_affinity],
1027 0, 0, 0,
1028 0, 0,
1029 &affinity_spec, sizeof(affinity_spec),
1030 0, 0,
1031 &event);
1032 check_result(res, c_pipeline_run_func, m_index, res);
1034 res = COI::EventWait(1, &event, -1, 1, 0, 0);
1035 check_result(res, c_event_wait, res);
1038 m_stream_lock.unlock();
1039 stream->set_pipeline(pipeline);
1041 return pipeline;
1044 void Engine::stream_destroy(_Offload_stream handle)
1046 // get stream
1047 Stream * stream = Stream::find_stream(handle, true);
1049 if (stream) {
1050 // return cpus for future use
1051 for (int i = 0; i < m_num_threads; i++) {
1052 if (stream->m_stream_cpus.test(i)) {
1053 CpuEl *cpu_el = m_cpus + i;
1054 CpuEl *cpu_first_el = cpu_el;
1055 // decrease count of thread "i" and move its CpuEl to the
1056 // proper place into the ordered list
1057 cpu_el->count--;
1058 while (cpu_el->prev) {
1059 if (cpu_first_el->count > cpu_el->prev->count) {
1060 break;
1062 else if (cpu_first_el->count == cpu_el->prev->count &&
1063 CPU_INDEX(cpu_first_el) > CPU_INDEX(cpu_el->prev)) {
1064 break;
1066 cpu_el = cpu_el->prev;
1068 cpu_el = cpu_el->prev;
1069 // If cpu_el for thread "i" must be moved in the list
1070 if (cpu_first_el != cpu_el) {
1071 // Thread "i" is used the least times. It must be set as
1072 // the m_cpu_head.
1073 if (!cpu_el) {
1074 if (!cpu_first_el->prev) {
1075 continue;
1077 // remove cpu_el.
1078 cpu_first_el->prev->next = cpu_first_el->next;
1079 if (cpu_first_el->next) {
1080 cpu_first_el->next->prev = cpu_first_el->prev;
1082 // make cpu_first_el as new m_cpu_head
1083 cpu_first_el->prev = NULL;
1084 cpu_first_el->next = m_cpu_head;
1085 m_cpu_head->prev = cpu_first_el;
1086 m_cpu_head = cpu_first_el;
1088 else {
1089 move_cpu_el_after(cpu_first_el, cpu_el);
1094 print_stream_cpu_list("stream_destroy");
1095 delete stream;
1097 else {
1098 LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
1099 LIBOFFLOAD_ABORT;
1103 uint64_t Engine::get_thread_id(void)
1105 Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
1106 if (thread == 0) {
1107 thread = new Thread(&m_proc_number);
1108 thread_setspecific(mic_thread_key, thread);
1111 return reinterpret_cast<uint64_t>(thread);
1114 AutoSet& Engine::get_auto_vars(void)
1116 Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
1117 if (thread == 0) {
1118 thread = new Thread(&m_proc_number);
1119 thread_setspecific(mic_thread_key, thread);
1122 return thread->get_auto_vars();
1125 void Engine::destroy_thread_data(void *data)
1127 delete static_cast<Thread*>(data);