2 Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #include "offload_engine.h"
35 #include <sys/types.h>
40 #include "offload_host.h"
41 #include "offload_table.h"
42 #include "offload_iterator.h"
44 #if defined(HOST_WINNT)
45 #define PATH_SEPARATOR ";"
47 #define PATH_SEPARATOR ":"
// NOTE(review): this listing is a garbled extraction -- statements are wrapped
// mid-token across lines and the leading integers are stale line numbers from
// the original file; code tokens below are preserved byte-for-byte.
// File-scope definitions: the global registry of streams (count, handle->Stream
// map, and the mutex guarding the map -- see Stream::find_stream below) plus
// the device library search path selected in Engine::init_process.
50 // Static members of Stream class must be described somewhere.
51 // These members describe the list of all streams defined in the program
52 // via a call to _Offload_stream_create.
53 uint64_t Stream::m_streams_count
= 0;
54 StreamMap
Stream::all_streams
;
55 mutex_t
Stream::m_stream_lock
;
// Library search path for the MIC-side runtime; initialized lazily in
// Engine::init_process from knc_library_path / knl_library_path.
56 char* mic_library_path
= 0;
// Names of the sink-side (device) functions the host resolves via
// COI::ProcessGetFunctionHandles in Engine::init_process; indexed in the same
// order as the c_func_* enumerators (c_funcs_total entries).
// NOTE(review): garbled extraction -- the opening brace and entries for
// original lines 59-65 are missing from this listing; only the last three
// array initializers are visible.
58 const char* Engine::m_func_names
[Engine::c_funcs_total
] =
66 "server_var_table_size",
67 "server_var_table_copy",
68 "server_set_stream_affinity"
// Signal-number -> name table used by Engine::fini_process to report how the
// device process died (indexed as c_signal_names[sig >= c_signal_max ? 0 : sig]).
// NOTE(review): garbled extraction -- the opening brace and the index-0
// ("unknown signal") entry from original lines 73-74 are missing here.
71 // Symbolic representation of system signals. Fix for CQ233593
72 const char* Engine::c_signal_names
[Engine::c_signal_max
] =
75 "SIGHUP", /* 1, Hangup (POSIX). */
76 "SIGINT", /* 2, Interrupt (ANSI). */
77 "SIGQUIT", /* 3, Quit (POSIX). */
78 "SIGILL", /* 4, Illegal instruction (ANSI). */
79 "SIGTRAP", /* 5, Trace trap (POSIX). */
80 "SIGABRT", /* 6, Abort (ANSI). */
81 "SIGBUS", /* 7, BUS error (4.2 BSD). */
82 "SIGFPE", /* 8, Floating-point exception (ANSI). */
83 "SIGKILL", /* 9, Kill, unblockable (POSIX). */
84 "SIGUSR1", /* 10, User-defined signal 1 (POSIX). */
85 "SIGSEGV", /* 11, Segmentation violation (ANSI). */
86 "SIGUSR2", /* 12, User-defined signal 2 (POSIX). */
87 "SIGPIPE", /* 13, Broken pipe (POSIX). */
88 "SIGALRM", /* 14, Alarm clock (POSIX). */
89 "SIGTERM", /* 15, Termination (ANSI). */
90 "SIGSTKFLT", /* 16, Stack fault. */
91 "SIGCHLD", /* 17, Child status has changed (POSIX). */
92 "SIGCONT", /* 18, Continue (POSIX). */
93 "SIGSTOP", /* 19, Stop, unblockable (POSIX). */
94 "SIGTSTP", /* 20, Keyboard stop (POSIX). */
95 "SIGTTIN", /* 21, Background read from tty (POSIX). */
96 "SIGTTOU", /* 22, Background write to tty (POSIX). */
97 "SIGURG", /* 23, Urgent condition on socket (4.2 BSD). */
98 "SIGXCPU", /* 24, CPU limit exceeded (4.2 BSD). */
99 "SIGXFSZ", /* 25, File size limit exceeded (4.2 BSD). */
100 "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD). */
101 "SIGPROF", /* 27, Profiling alarm clock (4.2 BSD). */
102 "SIGWINCH", /* 28, Window size change (4.3 BSD, Sun). */
103 "SIGIO", /* 29, I/O now possible (4.2 BSD). */
104 "SIGPWR", /* 30, Power failure restart (System V). */
105 "SIGSYS" /* 31, Bad system call. */
// Lazily initializes the engine under m_lock: starts the device process on
// first use, then notifies an attached debugger that target shared objects
// are loaded.
// NOTE(review): garbled extraction -- braces and the statements behind the
// "load pending images" / "(re)build pointer table" comments (original lines
// 116-127) are missing from this listing; presumably they called
// load_libraries()/init_ptr_data() -- confirm against the original file.
108 void Engine::init(void)
111 mutex_locker_t
locker(m_lock
);
114 // start process if not done yet
115 if (m_process
== 0) {
119 // load pending images
122 // and (re)build pointer table
128 // Inform the debugger
129 if (__dbg_is_attached
) {
130 __dbg_target_so_loaded();
// Debug helper: walks the m_cpu_head list and traces each available CPU as
// "index(use-count)", 20 entries per line, tagged with the caller name 'str'.
// NOTE(review): garbled extraction -- the declarations of the locals 'buffer'
// and 'count' (and the statements advancing cpu_el / incrementing count) are
// missing from this listing.
136 void Engine::print_stream_cpu_list(const char * str
)
140 CpuEl
* cpu_el
= m_cpu_head
;
142 OFFLOAD_DEBUG_TRACE(3,
143 "%s : cpu list as Index(Count) for the streams is :\n", str
);
// Only threads permitted by m_assigned_cpus (or all, when it is unset)
// are reported.
145 for (int i
= 0; i
< m_num_threads
; i
++) {
147 if (m_assigned_cpus
== 0 || (*m_assigned_cpus
)[i
]) {
149 sprintf(buffer
+ strlen(buffer
), "%d(%d) ", CPU_INDEX(cpu_el
), cpu_el
->count
);
// Flush a full line of 20 entries.
150 if (count
% 20 == 0) {
151 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer
);
// Flush the trailing partial line, if any.
156 if (count
% 20 != 0) {
157 OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer
);
// Creates and initializes the COI process on the device: builds the card
// environment, opens the engine handle, queries core/thread counts, builds
// the m_cpus availability list, configures DMA, locates or receives the
// target executable (offload_main fallback vs. in-memory FAT binary),
// creates the process, reserves buffer pools, resolves sink-side function
// handles, runs device-side init, and hands debugger metadata over.
// NOTE(review): garbled extraction -- statements are wrapped mid-token, the
// leading integers are stale line numbers, and many original lines (braces,
// some conditions/locals such as 'res', 'engine', 'dir', 'ptr', 'st') are
// missing; tokens below are preserved byte-for-byte.
161 void Engine::init_process(void)
165 const char **environ
;
166 char buf
[4096]; // For exe path name
167 char* mic_device_main
= 0;
169 // create environment for the target process
170 environ
= (const char**) mic_env_vars
.create_environ_for_card(m_index
);
172 for (const char **p
= environ
; *p
!= 0; p
++) {
173 OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index
, *p
);
177 // Create execution context in the specified device
178 OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index
,
180 res
= COI::EngineGetHandle(COI_ISA_MIC
, m_physical_index
, &engine
);
181 check_result(res
, c_get_engine_handle
, m_index
, res
);
183 // Get engine info on threads and cores.
184 // The values of core number and thread number will be used later at stream
185 // creation by call to _Offload_stream_create(device,number_of_cpus).
187 COI_ENGINE_INFO engine_info
;
189 res
= COI::EngineGetInfo(engine
, sizeof(COI_ENGINE_INFO
), &engine_info
);
190 check_result(res
, c_get_engine_info
, m_index
, res
);
// Select the device library path once, based on the reported ISA.
191 if (mic_library_path
== 0 ) {
192 if (engine_info
.ISA
== COI_DEVICE_KNC
) {
193 mic_library_path
= knc_library_path
;
195 else if (engine_info
.ISA
== COI_DEVICE_KNL
) {
196 mic_library_path
= knl_library_path
;
199 LIBOFFLOAD_ERROR(c_unknown_mic_device_type
);
203 // m_cpus is the list of all available threads.
204 // At the beginning all threads made available through OFFLOAD_DEVICES
205 // or all threads existed at the engine if OFFLOAD_DEVICES isn't set.
206 // m_cpu_head points to the head of the m_cpus list.
207 // m_cpus is ordered by number of streams using the thread.
208 // m_cpu_head points to the least used thread.
209 // After creating and destroying a stream the m_cpus list must be fixed
212 m_cpus
= (CpuEl
*)malloc(engine_info
.NumThreads
* sizeof(CpuEl
));
214 LIBOFFLOAD_ERROR(c_malloc
);
215 memset(m_cpus
, 0, engine_info
.NumThreads
* sizeof(CpuEl
));
216 CpuEl
* prev_cpu
= NULL
;
// Link the available threads into a doubly-linked list; the first
// available thread becomes m_cpu_head.
218 for (int i
= 0; i
< engine_info
.NumThreads
; i
++) {
219 if (m_assigned_cpus
== 0 || (*m_assigned_cpus
)[i
]) {
221 prev_cpu
->next
= m_cpus
+ i
;
224 m_cpu_head
= m_cpus
+ i
;
226 m_cpus
[i
].prev
= prev_cpu
;
228 prev_cpu
= m_cpus
+ i
;
232 // The following values will be used at pipeline creation for streams
233 m_num_cores
= engine_info
.NumCores
;
234 m_num_threads
= engine_info
.NumThreads
;
236 print_stream_cpu_list("init_process");
238 // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
239 // Only the value 2 is supported in 16.0
240 if (mic_dma_channel_count
== 2) {
241 if (COI::ProcessConfigureDMA
) {
242 // Set DMA channels using COI API
243 COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE
);
246 // Set environment variable COI_DMA_CHANNEL_COUNT
247 // use putenv instead of setenv as Windows has no setenv.
248 // Note: putenv requires its argument can't be freed or modified.
249 // So no free after call to putenv or elsewhere.
250 char * env_var
= strdup("COI_DMA_CHANNEL_COUNT=2");
252 LIBOFFLOAD_ERROR(c_malloc
);
257 // Target executable is not available then use compiler provided offload_main
258 if (__target_exe
== 0) {
259 // find target executable to be used if main application is not an
260 // offload build application.
261 const char *base_name
= "offload_main";
262 if (mic_library_path
!= 0) {
263 char *buf
= strdup(mic_library_path
);
265 LIBOFFLOAD_ERROR(c_malloc
);
266 char *try_name
= (char*) alloca(strlen(mic_library_path
) +
267 strlen(base_name
) + 2);
// Search each PATH_SEPARATOR-delimited directory of the library path
// for an existing regular file named offload_main.
270 for (dir
= strtok_r(buf
, PATH_SEPARATOR
, &ptr
); dir
!= 0;
271 dir
= strtok_r(0, PATH_SEPARATOR
, &ptr
)) {
272 // compose a full path
273 sprintf(try_name
, "%s/%s", dir
, base_name
);
275 // check if such file exists
277 if (stat(try_name
, &st
) == 0 && S_ISREG(st
.st_mode
)) {
278 mic_device_main
= strdup(try_name
);
279 if (mic_device_main
== NULL
)
280 LIBOFFLOAD_ERROR(c_malloc
);
286 if (mic_device_main
== 0) {
287 LIBOFFLOAD_ERROR(c_report_no_target_exe
, "offload_main");
291 OFFLOAD_DEBUG_TRACE(2,
292 "Loading target executable %s\n",mic_device_main
);
294 res
= COI::ProcessCreateFromFile(
296 mic_device_main
, // in_pBinaryName
299 environ
== 0, // in_DupEnv
300 environ
, // in_ppAdditionalEnv
301 mic_proxy_io
, // in_ProxyActive
302 mic_proxy_fs_root
, // in_ProxyfsRoot
303 mic_buffer_size
, // in_BufferSpace
304 mic_library_path
, // in_LibrarySearchPath
305 &m_process
// out_pProcess
309 // Target executable should be available by the time when we
310 // attempt to initialize the device
312 // Need the full path of the FAT exe for VTUNE
// NOTE(review): the #if for the following platform split is missing from
// this listing; only the #endif at original line 318 survives.
315 ssize_t len
= readlink("/proc/self/exe", buf
,1000);
317 int len
= GetModuleFileName(NULL
, buf
,1000);
318 #endif // TARGET_WINNT
320 LIBOFFLOAD_ERROR(c_report_no_host_exe
);
323 else if (len
> 999) {
324 LIBOFFLOAD_ERROR(c_report_path_buff_overflow
);
330 OFFLOAD_DEBUG_TRACE(2,
331 "Loading target executable \"%s\" from %p, size %lld, host file %s\n",
332 __target_exe
->name
, __target_exe
->data
, __target_exe
->size
,
335 res
= COI::ProcessCreateFromMemory(
337 __target_exe
->name
, // in_pBinaryName
338 __target_exe
->data
, // in_pBinaryBuffer
339 __target_exe
->size
, // in_BinaryBufferLength,
342 environ
== 0, // in_DupEnv
343 environ
, // in_ppAdditionalEnv
344 mic_proxy_io
, // in_ProxyActive
345 mic_proxy_fs_root
, // in_ProxyfsRoot
346 mic_buffer_size
, // in_BufferSpace
347 mic_library_path
, // in_LibrarySearchPath
348 buf
, // in_FileOfOrigin
349 -1, // in_FileOfOriginOffset use -1 to indicate to
350 // COI that this is a FAT binary
351 &m_process
// out_pProcess
354 check_result(res
, c_process_create
, m_index
, res
);
// Optionally pre-reserve device buffer pools (4K and 2M page pools).
356 if ((mic_4k_buffer_size
!= 0) || (mic_2m_buffer_size
!=0)) {
357 // available only in MPSS 4.2 and greater
358 if (COI::ProcessSetCacheSize
!= 0 ) {
360 // Need compiler to use MPSS 3.2 or greater to get these
361 // definition so currently hardcoding it
362 // COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
364 res
= COI::ProcessSetCacheSize(
365 m_process
, // in_Process
366 mic_2m_buffer_size
, // in_HugePagePoolSize
367 flags
, // inHugeFlags
368 mic_4k_buffer_size
, // in_SmallPagePoolSize
369 flags
, // inSmallFlags
370 0, // in_NumDependencies
371 0, // in_pDependencies
374 OFFLOAD_DEBUG_TRACE(2,
375 "Reserve target buffers 4K pages = %d 2M pages = %d\n",
376 mic_4k_buffer_size
, mic_2m_buffer_size
);
377 check_result(res
, c_process_set_cache_size
, m_index
, res
);
380 OFFLOAD_DEBUG_TRACE(2,
381 "Reserve target buffers not supported in current MPSS\n");
385 // get function handles
386 res
= COI::ProcessGetFunctionHandles(m_process
, c_funcs_total
,
387 m_func_names
, m_funcs
);
388 check_result(res
, c_process_get_func_handles
, m_index
, res
);
390 // initialize device side
391 pid_t pid
= init_device();
// Export target-exe name, pid and device id for an attached debugger.
394 if (__dbg_is_attached
) {
395 // TODO: we have in-memory executable now.
396 // Check with IDB team what should we provide them now?
397 if (__target_exe
== 0) {
398 strcpy(__dbg_target_exe_name
, "offload_main");
401 if (strlen(__target_exe
->name
) < MAX_TARGET_NAME
) {
402 strcpy(__dbg_target_exe_name
, __target_exe
->name
);
405 __dbg_target_so_pid
= pid
;
406 __dbg_target_id
= m_physical_index
;
407 // The call to __dbg_target_so_loaded() is moved
408 // to Engine:init so all the libraries are loaded before
409 // informing debugger so debugger can access them.
410 // __dbg_target_so_loaded();
// Tears down the device process (infinite-wait ProcessDestroy), then reports
// how it exited: by signal (looked up in c_signal_names, index 0 for unknown)
// or by non-zero exit code; finally tells an attached debugger the target
// shared objects are gone.
// NOTE(review): garbled extraction -- braces, the 'ret'/'sig' declarations
// and some conditions (e.g. the use of 'verbose') are missing from this
// listing; tokens below are preserved byte-for-byte.
414 void Engine::fini_process(bool verbose
)
416 if (m_process
!= 0) {
420 // destroy target process
421 OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
424 COIRESULT res
= COI::ProcessDestroy(m_process
, -1, 0, &ret
, &sig
);
427 if (res
== COI_SUCCESS
) {
428 OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
// Report death-by-signal with a symbolic name; out-of-range signal
// numbers fall back to entry 0 of the table.
433 c_mic_process_exit_sig
, m_index
, sig
,
434 c_signal_names
[sig
>= c_signal_max
? 0 : sig
]);
437 LIBOFFLOAD_ERROR(c_mic_process_exit_ret
, m_index
, ret
);
442 if (__dbg_is_attached
) {
443 __dbg_target_so_unloaded();
448 LIBOFFLOAD_ERROR(c_mic_process_exit
, m_index
);
// Loads every target image queued in m_images into the device process via
// COI::ProcessLoadLibraryFromMemory, records each in m_dyn_libs, and treats
// COI_ALREADY_EXISTS as success.
// NOTE(review): garbled extraction -- braces, the 'res'/'lib' declarations,
// and several ProcessLoadLibraryFromMemory arguments (original lines 467-471)
// are missing from this listing; tokens below are preserved byte-for-byte.
454 void Engine::load_libraries()
456 // load libraries collected so far
457 for (TargetImageList::iterator it
= m_images
.begin();
458 it
!= m_images
.end(); it
++) {
459 OFFLOAD_DEBUG_TRACE(2,
460 "Loading library \"%s\" from %p, size %llu, host file %s\n",
461 it
->name
, it
->data
, it
->size
, it
->origin
);
463 // load library to the device
466 res
= COI::ProcessLoadLibraryFromMemory(m_process
,
// A file of origin is flagged with offset -1 (FAT binary convention),
// otherwise 0.
472 (it
->origin
) ? -1 : 0,
473 COI_LOADLIBRARY_V1_FLAGS
,
// Remember the loaded library so unload_library can find it by data ptr.
475 m_dyn_libs
.push_front(DynLib(it
->name
, it
->data
, lib
));
477 if (res
!= COI_SUCCESS
&& res
!= COI_ALREADY_EXISTS
) {
478 check_result(res
, c_load_library
, it
->origin
, m_index
, res
);
// Unloads the dynamic library previously registered (by load_libraries) whose
// image data pointer equals 'data'; no-op when the device process was never
// started. 'name' is used only for the trace message.
// NOTE(review): garbled extraction -- braces, the 'res' declaration and the
// early return / loop exit after the match are missing from this listing;
// tokens below are preserved byte-for-byte.
484 void Engine::unload_library(const void *data
, const char *name
)
486 if (m_process
== 0) {
489 for (DynLibList::iterator it
= m_dyn_libs
.begin();
490 it
!= m_dyn_libs
.end(); it
++) {
// Match by the host-side image data pointer recorded at load time.
491 if (it
->data
== data
) {
493 OFFLOAD_DEBUG_TRACE(2,
494 "Unloading library \"%s\"\n",name
);
495 res
= COI::ProcessUnloadLibrary(m_process
,it
->lib
);
496 m_dyn_libs
.erase(it
);
497 if (res
!= COI_SUCCESS
) {
498 check_result(res
, c_unload_library
, m_index
, res
);
505 static bool target_entry_cmp(
506 const VarList::BufEntry
&l
,
507 const VarList::BufEntry
&r
510 const char *l_name
= reinterpret_cast<const char*>(l
.name
);
511 const char *r_name
= reinterpret_cast<const char*>(r
.name
);
512 return strcmp(l_name
, r_name
) < 0;
515 static bool host_entry_cmp(
516 const VarTable::Entry
*l
,
517 const VarTable::Entry
*r
520 return strcmp(l
->name
, r
->name
) < 0;
// Builds the host<->device static-variable address map: collects host var
// entries, asks the device for its var table size and contents (two pipeline
// calls: c_func_var_table_size then c_func_var_table_copy), maps the result
// buffer, patches names, sorts both tables by name, and merge-joins them,
// inserting a PtrData record per matching variable.
// NOTE(review): garbled extraction -- braces, locals ('params', 'event',
// 'buffer'), early returns, and the merge-loop advance logic are missing
// from this listing; tokens below are preserved byte-for-byte. Two lines
// contain the mojibake "¶ms" which is an HTML-entity corruption of
// "&params" -- left untouched here.
523 void Engine::init_ptr_data(void)
528 // Prepare table of host entries
529 std::vector
<const VarTable::Entry
*> host_table(
530 Iterator(__offload_vars
.get_head()),
533 // no need to do anything further if host table is empty
534 if (host_table
.size() <= 0) {
538 // Get var table entries from the target.
539 // First we need to get size for the buffer to copy data
545 res
= COI::PipelineRunFunction(get_pipeline(),
546 m_funcs
[c_func_var_table_size
],
550 &params
, sizeof(params
),
552 check_result(res
, c_pipeline_run_func
, m_index
, res
);
554 res
= COI::EventWait(1, &event
, -1, 1, 0, 0);
555 check_result(res
, c_event_wait
, res
);
// Nothing to mirror when the device reports an empty table.
557 if (params
.length
== 0) {
561 // create buffer for target entries and copy data to host
563 res
= COI::BufferCreate(params
.length
, COI_BUFFER_NORMAL
, 0, 0, 1,
564 &m_process
, &buffer
);
565 check_result(res
, c_buf_create
, m_index
, res
);
567 COI_ACCESS_FLAGS flags
= COI_SINK_WRITE
;
568 res
= COI::PipelineRunFunction(get_pipeline(),
569 m_funcs
[c_func_var_table_copy
],
572 ¶ms
.nelems
, sizeof(params
.nelems
),
575 check_result(res
, c_pipeline_run_func
, m_index
, res
);
577 res
= COI::EventWait(1, &event
, -1, 1, 0, 0);
578 check_result(res
, c_event_wait
, res
);
580 // patch names in target data
581 VarList::BufEntry
*target_table
;
582 COIMAPINSTANCE map_inst
;
583 res
= COI::BufferMap(buffer
, 0, params
.length
, COI_MAP_READ_ONLY
, 0, 0,
585 reinterpret_cast<void**>(&target_table
));
586 check_result(res
, c_buf_map
, res
);
588 VarList::table_patch_names(target_table
, params
.nelems
);
// Sort both sides by name so a single linear merge finds all matches.
591 std::sort(target_table
, target_table
+ params
.nelems
, target_entry_cmp
);
592 std::sort(host_table
.begin(), host_table
.end(), host_entry_cmp
);
594 // merge host and target entries and enter matching vars map
595 std::vector
<const VarTable::Entry
*>::const_iterator hi
=
597 std::vector
<const VarTable::Entry
*>::const_iterator he
=
599 const VarList::BufEntry
*ti
= target_table
;
600 const VarList::BufEntry
*te
= target_table
+ params
.nelems
;
602 while (hi
!= he
&& ti
!= te
) {
603 int res
= strcmp((*hi
)->name
, reinterpret_cast<const char*>(ti
->name
));
606 // add matching entry to var map
607 PtrData
*ptr
= insert_ptr_data((*hi
)->addr
, (*hi
)->size
, is_new
);
609 // store address for new entries
611 ptr
->mic_addr
= ti
->addr
;
612 ptr
->is_static
= true;
613 ptr
->var_alloc_type
= (*hi
)->var_alloc_type
;
615 ptr
->alloc_ptr_data_lock
.unlock();
// Release the mapping and the temporary buffer.
628 res
= COI::BufferUnmap(map_inst
, 0, 0, 0);
629 check_result(res
, c_buf_unmap
, res
);
631 res
= COI::BufferDestroy(buffer
);
632 check_result(res
, c_buf_destroy
, res
);
// Runs the device-side compute function: flattens the COIBUFFER list into
// stack arrays (alloca), marks every buffer COI_SINK_WRITE (flagged TODO in
// the original), selects the thread pipeline for no_stream or the stream's
// own pipeline otherwise, then launches m_funcs[c_func_compute].
// NOTE(review): garbled extraction -- braces, several parameters (original
// lines 638-642, 644-647), the 'bufs'/'i'/'res' declarations, part of the
// pipeline ternary, and the trailing RunFunction arguments/return are
// missing from this listing; tokens below are preserved byte-for-byte.
635 COIRESULT
Engine::compute(
636 _Offload_stream stream
,
637 const std::list
<COIBUFFER
> &buffers
,
643 const COIEVENT
* deps
,
648 COI_ACCESS_FLAGS
*flags
;
651 // convert buffers list to array
652 int num_bufs
= buffers
.size();
654 bufs
= (COIBUFFER
*) alloca(num_bufs
* sizeof(COIBUFFER
));
655 flags
= (COI_ACCESS_FLAGS
*) alloca(num_bufs
*
656 sizeof(COI_ACCESS_FLAGS
));
659 for (std::list
<COIBUFFER
>::const_iterator it
= buffers
.begin();
660 it
!= buffers
.end(); it
++) {
663 // TODO: this should be fixed
664 flags
[i
++] = COI_SINK_WRITE
;
// no_stream uses the per-thread pipeline; otherwise use the stream's.
671 COIPIPELINE pipeline
= (stream
== no_stream
) ?
673 get_pipeline(stream
);
675 res
= COI::PipelineRunFunction(pipeline
,
676 m_funcs
[c_func_compute
],
677 num_bufs
, bufs
, flags
,
// Runs the device-side init function (m_funcs[c_func_init]) with a payload
// describing this device (index, total devices, console level, offload
// report level), waits for completion, and traces the device process pid.
// NOTE(review): garbled extraction -- braces, the 'data'/'res'/'event'/'pid'
// declarations, the RunFunction argument tail, and the return statement are
// missing from this listing; tokens below are preserved byte-for-byte.
685 pid_t
Engine::init_device(void)
691 int offload_report_level
;
697 OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init
,
698 "Initializing device with logical index %d "
699 "and physical index %d\n",
700 m_index
, m_physical_index
);
// Fill the init payload handed to the sink-side init function.
703 data
.device_index
= m_index
;
704 data
.devices_total
= mic_engines_total
;
705 data
.console_level
= console_enabled
;
706 data
.offload_report_level
= offload_report_level
;
708 res
= COI::PipelineRunFunction(get_pipeline(),
709 m_funcs
[c_func_init
],
714 check_result(res
, c_pipeline_run_func
, m_index
, res
);
716 res
= COI::EventWait(1, &event
, -1, 1, 0, 0);
717 check_result(res
, c_event_wait
, res
);
719 OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid
);
// Per-host-thread record kept in TLS (see Engine::get_pipeline /
// get_thread_id / get_auto_vars): holds one COIPIPELINE per engine plus a
// pointer to the shared pipeline counter, decremented on destruction.
// NOTE(review): garbled extraction -- the 'class Thread {' header, the
// destructor's opening lines and the #if matching the visible '#else //
// TARGET_WINNT' (original lines 725, 729-732), access specifiers and
// closing braces are missing from this listing; tokens below are preserved
// byte-for-byte.
724 // data associated with each thread
// Constructor: remember the shared counter and zero the pipeline table.
726 Thread(long* addr_coipipe_counter
) {
727 m_addr_coipipe_counter
= addr_coipipe_counter
;
728 memset(m_pipelines
, 0, sizeof(m_pipelines
));
// Destructor body (header missing above): release this thread's slot in
// the pipeline counter, then destroy any pipelines it created.
733 __sync_sub_and_fetch(m_addr_coipipe_counter
, 1);
734 #else // TARGET_WINNT
735 _InterlockedDecrement(m_addr_coipipe_counter
);
736 #endif // TARGET_WINNT
737 for (int i
= 0; i
< mic_engines_total
; i
++) {
738 if (m_pipelines
[i
] != 0) {
739 COI::PipelineDestroy(m_pipelines
[i
]);
// Accessor: pipeline previously created for engine 'index' (0 if none).
744 COIPIPELINE
get_pipeline(int index
) const {
745 return m_pipelines
[index
];
// Records the pipeline created for engine 'index'.
748 void set_pipeline(int index
, COIPIPELINE pipeline
) {
749 m_pipelines
[index
] = pipeline
;
752 AutoSet
& get_auto_vars() {
// Shared counter of live per-thread records (see ctor/dtor above).
757 long* m_addr_coipipe_counter
;
// One pipeline slot per engine.
759 COIPIPELINE m_pipelines
[MIC_ENGINES_MAX
];
// Returns (creating on first use) the calling thread's pipeline for this
// engine: lazily allocates the TLS Thread record, enforces the
// COI_PIPELINE_MAX_PIPELINES limit via an atomic counter, and creates the
// pipeline either unmasked (no m_assigned_cpus) or with a COI_CPU_MASK built
// from the assigned-CPUs bitset.
// NOTE(review): garbled extraction -- braces, the TLS null-check, the
// 'proc_num'/'res' declarations, the #if matching the visible '#else //
// TARGET_WINNT', the else branch for masked creation, and the return are
// missing from this listing; tokens below are preserved byte-for-byte.
762 COIPIPELINE
Engine::get_pipeline(void)
764 Thread
* thread
= (Thread
*) thread_getspecific(mic_thread_key
);
// First use on this thread: create and cache the TLS record.
766 thread
= new Thread(&m_proc_number
);
767 thread_setspecific(mic_thread_key
, thread
);
770 COIPIPELINE pipeline
= thread
->get_pipeline(m_index
);
// Claim a pipeline slot; platform-specific atomic increment.
776 proc_num
= __sync_fetch_and_add(&m_proc_number
, 1);
777 #else // TARGET_WINNT
778 proc_num
= _InterlockedIncrement(&m_proc_number
);
779 #endif // TARGET_WINNT
781 if (proc_num
> COI_PIPELINE_MAX_PIPELINES
) {
782 LIBOFFLOAD_ERROR(c_coipipe_max_number
, COI_PIPELINE_MAX_PIPELINES
);
786 // Create pipeline for this thread
787 if (m_assigned_cpus
== 0) {
788 // If m_assigned_cpus is NULL, it implies all threads
789 // Create the pipeline with no CPU mask
790 res
= COI::PipelineCreate(m_process
, 0, mic_stack_size
, &pipeline
);
792 // Create COI CPU mask
793 COI_CPU_MASK in_Mask
;
794 res
= COI::PipelineClearCPUMask(in_Mask
);
795 check_result(res
, c_clear_cpu_mask
, m_index
, res
);
797 int threads_per_core
= m_num_threads
/ m_num_cores
;
799 // Available threads are defined by examining of m_assigned_cpus bitset.
// Note: the loop starts at thread 1, skipping thread 0.
801 for (int i
= 1; i
< m_num_threads
; i
++) {
802 // For available thread i m_assigned_cpus[i] is equal to 1
803 if ((*m_assigned_cpus
)[i
]) {
804 COI_CPU_MASK_SET(i
, in_Mask
);
807 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this CPU thread\n"
808 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
809 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
810 in_Mask
[0], in_Mask
[1], in_Mask
[2], in_Mask
[3],
811 in_Mask
[4], in_Mask
[5], in_Mask
[6], in_Mask
[7],
812 in_Mask
[8], in_Mask
[9], in_Mask
[10], in_Mask
[11],
813 in_Mask
[12], in_Mask
[13], in_Mask
[14], in_Mask
[15]);
815 // Create the pipeline with allowable CPUs
816 res
= COI::PipelineCreate(m_process
, in_Mask
, mic_stack_size
, &pipeline
);
818 check_result(res
, c_pipeline_create
, m_index
, res
);
// Cache the new pipeline in the TLS record for subsequent calls.
819 thread
->set_pipeline(m_index
, pipeline
);
// Looks up the Stream registered under 'handle' in the global all_streams
// map (under m_stream_lock); when 'remove' is set, also erases the map entry.
// NOTE(review): garbled extraction -- braces, the result variable, the use
// of 'remove' guarding the erase, and the return statement are missing from
// this listing; tokens below are preserved byte-for-byte.
824 Stream
* Stream::find_stream(uint64_t handle
, bool remove
)
828 m_stream_lock
.lock();
830 StreamMap::iterator it
= all_streams
.find(handle
);
831 if (it
!= all_streams
.end()) {
834 all_streams
.erase(it
);
838 m_stream_lock
.unlock();
842 void Engine::move_cpu_el_after(CpuEl
* cpu_what
, CpuEl
* cpu_after
)
844 if (cpu_what
== cpu_after
) {
847 CpuEl
* cpu_prev
= cpu_what
->prev
;
851 m_cpu_head
= cpu_what
->next
;
854 cpu_prev
->next
= cpu_what
->next
;
856 if (cpu_what
->next
) {
857 cpu_what
->next
->prev
= cpu_prev
;
860 // insert cpu_what after cpu_after
861 cpu_what
->prev
= cpu_after
;
862 cpu_what
->next
= cpu_after
->next
;
863 if (cpu_after
->next
) {
864 cpu_after
->next
->prev
= cpu_what
;
866 cpu_after
->next
= cpu_what
;
// Returns (creating on first use) the pipeline for the given stream handle:
// finds the Stream, enforces the pipeline limit, picks the stream's CPUs from
// the least-used end of the m_cpus list (bumping use counts and re-sorting
// the list), creates a masked pipeline, runs the sink-side
// set_stream_affinity function ("compact" by default, overridable via the
// OFFLOAD_STREAM_AFFINITY environment variable), and caches the pipeline on
// the Stream.
// NOTE(review): garbled extraction -- braces, early returns, the
// 'proc_num'/'res'/'event'/'affinity_type'/'i' declarations, the #if
// matching the visible '#else // TARGET_WINNT', and several loop-exit /
// else lines are missing from this listing; tokens below are preserved
// byte-for-byte.
869 COIPIPELINE
Engine::get_pipeline(_Offload_stream handle
)
871 Stream
* stream
= Stream::find_stream(handle
, false);
874 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_index
);
878 COIPIPELINE pipeline
= stream
->get_pipeline();
883 COI_CPU_MASK in_Mask
;
// Claim a pipeline slot; platform-specific atomic increment.
886 proc_num
= __sync_fetch_and_add(&m_proc_number
, 1);
887 #else // TARGET_WINNT
888 proc_num
= _InterlockedIncrement(&m_proc_number
);
889 #endif // TARGET_WINNT
891 if (proc_num
> COI_PIPELINE_MAX_PIPELINES
) {
892 LIBOFFLOAD_ERROR(c_coipipe_max_number
, COI_PIPELINE_MAX_PIPELINES
);
896 m_stream_lock
.lock();
898 // start process if not done yet
899 if (m_process
== 0) {
904 res
= COI::PipelineClearCPUMask(in_Mask
);
905 check_result(res
, c_clear_cpu_mask
, m_index
, res
);
907 int stream_cpu_num
= stream
->get_cpu_number();
909 stream
->m_stream_cpus
.reset();
911 int threads_per_core
= m_num_threads
/ m_num_cores
;
914 // Available threads is taken from m_cpus list.
915 // m_cpu_head points to the head of m_cpus.
916 // the elements of m_cpus is ordered by the number of usage in streams.
918 CpuEl
*cpu_el
= m_cpu_head
;
919 CpuEl
*cpu_used_el
, *cpu_used_prev
, *cpu_prev
;
// Take the first stream_cpu_num least-used threads for this stream's mask.
921 for (int i
= 0; i
< stream_cpu_num
; i
++) {
922 COI_CPU_MASK_SET(CPU_INDEX(cpu_el
), in_Mask
);
923 stream
->m_stream_cpus
.set(CPU_INDEX(cpu_el
));
924 //If the number of available threads is less than stream_cpu_num,
925 // the stream_cpu_num is restricted to this number.
929 if (i
+ 1 < stream_cpu_num
) {
930 cpu_el
= cpu_el
->next
;
934 // assertion : cpu_el points to the last used thread
// Bump the use count of every chosen element and re-insert each into its
// ordered position (by count, then by thread index).
935 cpu_used_el
= cpu_el
;
936 while (cpu_used_el
) {
937 cpu_used_el
->count
++;
938 cpu_el
= cpu_prev
= cpu_used_el
;
939 cpu_used_prev
= cpu_used_el
->prev
;
941 cpu_used_el
= cpu_used_prev
;
946 if (cpu_used_el
->count
< cpu_el
->count
) {
949 // Equal used threads are ordered by thread number to
950 // assign to a stream as contiguous threads as possible.
951 else if (cpu_used_el
->count
== cpu_el
->count
&&
952 CPU_INDEX(cpu_used_el
) < CPU_INDEX(cpu_el
)) {
956 cpu_el
= cpu_el
->next
;
958 if (cpu_used_el
!= cpu_prev
) {
959 move_cpu_el_after(cpu_used_el
, cpu_prev
);
961 cpu_used_el
= cpu_used_prev
;
963 print_stream_cpu_list("get_pipeline");
965 // create pipeline for this thread
966 OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this Stream\n"
967 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
968 "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
969 in_Mask
[0], in_Mask
[1], in_Mask
[2], in_Mask
[3],
970 in_Mask
[4], in_Mask
[5], in_Mask
[6], in_Mask
[7],
971 in_Mask
[8], in_Mask
[9], in_Mask
[10], in_Mask
[11],
972 in_Mask
[12], in_Mask
[13], in_Mask
[14], in_Mask
[15]);
973 res
= COI::PipelineCreate(m_process
, in_Mask
,
974 mic_stack_size
, &pipeline
);
975 check_result(res
, c_pipeline_create
, m_index
, res
);
977 // Set stream's affinities
979 struct affinity_spec affinity_spec
;
983 // "compact" by default
984 affinity_spec
.affinity_type
= affinity_compact
;
986 // Check if user has specified type of affinity
987 if ((affinity_type
= getenv("OFFLOAD_STREAM_AFFINITY")) !=
990 char affinity_str
[16];
991 int affinity_str_len
;
993 OFFLOAD_DEBUG_TRACE(2,
994 "User has specified OFFLOAD_STREAM_AFFINITY=%s\n",
997 // Set type of affinity requested
// Lower-case at most 15 characters of the requested affinity name.
998 affinity_str_len
= strlen(affinity_type
);
999 for (i
=0; i
<affinity_str_len
&& i
<15; i
++)
1001 affinity_str
[i
] = tolower(affinity_type
[i
]);
1003 affinity_str
[i
] = '\0';
1004 if (strcmp(affinity_str
, "compact") == 0) {
1005 affinity_spec
.affinity_type
= affinity_compact
;
1006 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1007 } else if (strcmp(affinity_str
, "scatter") == 0) {
1008 affinity_spec
.affinity_type
= affinity_scatter
;
1009 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n");
// Unrecognized value: report the error and fall back to compact.
1011 LIBOFFLOAD_ERROR(c_incorrect_affinity
, affinity_str
);
1012 affinity_spec
.affinity_type
= affinity_compact
;
1013 OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1016 // Make flat copy of sink mask because COI's mask is opaque
1017 for (i
=0; i
<16; i
++) {
1018 affinity_spec
.sink_mask
[i
] = in_Mask
[i
];
1020 // Set number of cores and threads
1021 affinity_spec
.num_cores
= m_num_cores
;
1022 affinity_spec
.num_threads
= m_num_threads
;
1025 res
= COI::PipelineRunFunction(pipeline
,
1026 m_funcs
[c_func_set_stream_affinity
],
1029 &affinity_spec
, sizeof(affinity_spec
),
1032 check_result(res
, c_pipeline_run_func
, m_index
, res
);
1034 res
= COI::EventWait(1, &event
, -1, 1, 0, 0);
1035 check_result(res
, c_event_wait
, res
);
1038 m_stream_lock
.unlock();
// Cache the pipeline on the stream for subsequent calls.
1039 stream
->set_pipeline(pipeline
);
// Destroys the stream registered under 'handle' (removed from the registry
// via find_stream(handle, true)) and returns its CPUs to the pool: for each
// thread the stream used, decrement its use count and move its CpuEl back to
// the correct ordered position (count ascending, then thread index), possibly
// making it the new m_cpu_head. Errors out when the handle is unknown.
// NOTE(review): garbled extraction -- braces, the count decrement, loop-break
// lines, else branches, and the final cleanup/return are missing from this
// listing; tokens below are preserved byte-for-byte.
1044 void Engine::stream_destroy(_Offload_stream handle
)
1047 Stream
* stream
= Stream::find_stream(handle
, true);
1050 // return cpus for future use
1051 for (int i
= 0; i
< m_num_threads
; i
++) {
1052 if (stream
->m_stream_cpus
.test(i
)) {
1053 CpuEl
*cpu_el
= m_cpus
+ i
;
1054 CpuEl
*cpu_first_el
= cpu_el
;
1055 // decrease count of thread "i" and move its CpuEl to the
1056 // proper place into the ordered list
// Walk backwards to find the first element that should stay ahead of
// thread "i" after its count decrease.
1058 while (cpu_el
->prev
) {
1059 if (cpu_first_el
->count
> cpu_el
->prev
->count
) {
1062 else if (cpu_first_el
->count
== cpu_el
->prev
->count
&&
1063 CPU_INDEX(cpu_first_el
) > CPU_INDEX(cpu_el
->prev
)) {
1066 cpu_el
= cpu_el
->prev
;
1068 cpu_el
= cpu_el
->prev
;
1069 // If cpu_el for thread "i" must be moved in the list
1070 if (cpu_first_el
!= cpu_el
) {
1071 // Thread "i" is used the least times. It must be set as
1074 if (!cpu_first_el
->prev
) {
// Unlink cpu_first_el from its current position.
1078 cpu_first_el
->prev
->next
= cpu_first_el
->next
;
1079 if (cpu_first_el
->next
) {
1080 cpu_first_el
->next
->prev
= cpu_first_el
->prev
;
1082 // make cpu_first_el as new m_cpu_head
1083 cpu_first_el
->prev
= NULL
;
1084 cpu_first_el
->next
= m_cpu_head
;
1085 m_cpu_head
->prev
= cpu_first_el
;
1086 m_cpu_head
= cpu_first_el
;
1089 move_cpu_el_after(cpu_first_el
, cpu_el
);
1094 print_stream_cpu_list("stream_destroy");
// Unknown stream handle: report the error.
1098 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_index
);
1103 uint64_t Engine::get_thread_id(void)
1105 Thread
* thread
= (Thread
*) thread_getspecific(mic_thread_key
);
1107 thread
= new Thread(&m_proc_number
);
1108 thread_setspecific(mic_thread_key
, thread
);
1111 return reinterpret_cast<uint64_t>(thread
);
1114 AutoSet
& Engine::get_auto_vars(void)
1116 Thread
* thread
= (Thread
*) thread_getspecific(mic_thread_key
);
1118 thread
= new Thread(&m_proc_number
);
1119 thread_setspecific(mic_thread_key
, thread
);
1122 return thread
->get_auto_vars();
1125 void Engine::destroy_thread_data(void *data
)
1127 delete static_cast<Thread
*>(data
);