* pt.c (tsubst_decl <FUNCTION_DECL>): Move var decls to
[official-gcc.git] / liboffloadmic / runtime / offload_host.cpp
blobcab08c43550d7e7872f466c510f5b9aac91c7f20
1 /*
2 Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 // Forward declaration as the following 2 functions are declared as friend
32 // in offload_engine.h.
33 // CLANG does not like static to been after friend declaration.
34 static void __offload_init_library_once(void);
35 static void __offload_fini_library(void);
37 #include "offload_host.h"
38 #ifdef MYO_SUPPORT
39 #include "offload_myo_host.h"
40 #endif
42 #include <malloc.h>
43 #ifndef TARGET_WINNT
44 #include <alloca.h>
45 #include <elf.h>
46 #endif // TARGET_WINNT
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
54 #include <algorithm>
55 #include <bitset>
56 #include <iostream>
58 #if defined(HOST_WINNT)
59 #define PATH_SEPARATOR ";"
60 #else
61 #define PATH_SEPARATOR ":"
62 #endif
64 #define GET_OFFLOAD_NUMBER(timer_data) \
65 timer_data? timer_data->offload_number : 0
67 static void (*task_completion_callback)(void *);
69 extern "C" {
70 #ifdef TARGET_WINNT
71 // Windows does not support imports from libraries without actually
72 // including them as dependence. We don't want to include in the
73 // dependence since is it used only for Fortran when traceback is enabled.
74 // Chose to implement it with GetProcAddress.
75 #define FORTRAN_TRACE_BACK win_for__continue_traceback
76 int win_for__continue_traceback( _Offload_result coi_offload_result )
78 HINSTANCE hDLL;
79 int (* TraceBackRoutine)(_Offload_result value);
81 hDLL = LoadLibrary("libifcoremd.dll");
82 if (hDLL != 0) {
83 TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
84 "for__continue_traceback");
85 if (TraceBackRoutine != 0) {
86 return TraceBackRoutine(coi_offload_result);
88 else {
89 OFFLOAD_TRACE(3,
90 "Cannot find for__continue_traceback routine in libifcorert.dll\n");
91 exit(1);
94 else {
95 OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
96 exit(1);
98 return 0;
101 #else // TARGET_WINNT
103 #define FORTRAN_TRACE_BACK for__continue_traceback
105 // for__continue_traceback is provided as a dummy to resolve link time symbols
106 // for C/C++ programs. For Fortran the actual fortran library function in
107 // libifcore.so is used.
108 #pragma weak for__continue_traceback
109 int for__continue_traceback( _Offload_result coi_offload_result )
111 OFFLOAD_TRACE(3,
112 "liboffload function for_continue_traceback should not be called.\n");
113 exit(1);
115 #endif //TARGET_WINNT
116 } // extern "C"
118 #ifdef TARGET_WINNT
119 // Small subset of ELF declarations for Windows which is needed to compile
120 // this file. ELF header is used to understand what binary type is contained
121 // in the target image - shared library or executable.
123 typedef uint16_t Elf64_Half;
124 typedef uint32_t Elf64_Word;
125 typedef uint64_t Elf64_Addr;
126 typedef uint64_t Elf64_Off;
128 #define EI_NIDENT 16
130 #define ET_EXEC 2
131 #define ET_DYN 3
133 typedef struct
135 unsigned char e_ident[EI_NIDENT];
136 Elf64_Half e_type;
137 Elf64_Half e_machine;
138 Elf64_Word e_version;
139 Elf64_Addr e_entry;
140 Elf64_Off e_phoff;
141 Elf64_Off e_shoff;
142 Elf64_Word e_flags;
143 Elf64_Half e_ehsize;
144 Elf64_Half e_phentsize;
145 Elf64_Half e_phnum;
146 Elf64_Half e_shentsize;
147 Elf64_Half e_shnum;
148 Elf64_Half e_shstrndx;
149 } Elf64_Ehdr;
150 #endif // TARGET_WINNT
152 // Host console and file logging
153 const char *prefix;
154 int console_enabled = 0;
155 int offload_number = 0;
157 static const char *htrace_envname = "H_TRACE";
158 static const char *offload_report_envname = "OFFLOAD_REPORT";
159 static const char *timer_envname = "H_TIME";
161 // DMA channel count used by COI and set via
162 // OFFLOAD_DMA_CHANNEL_COUNT environment variable
163 uint32_t mic_dma_channel_count;
165 // Trace information
166 static const char* vardesc_direction_as_string[] = {
167 "NOCOPY",
168 "IN",
169 "OUT",
170 "INOUT"
172 static const char* vardesc_type_as_string[] = {
173 "unknown",
174 "data",
175 "data_ptr",
176 "func_ptr",
177 "void_ptr",
178 "string_ptr",
179 "dv",
180 "dv_data",
181 "dv_data_slice",
182 "dv_ptr",
183 "dv_ptr_data",
184 "dv_ptr_data_slice",
185 "cean_var",
186 "cean_var_ptr",
187 "c_data_ptr_array",
188 "c_extended_type",
189 "c_func_ptr_array",
190 "c_void_ptr_array",
191 "c_string_ptr_array",
192 "c_data_ptr_ptr",
193 "c_func_ptr_ptr",
194 "c_void_ptr_ptr",
195 "c_string_ptr_ptr",
196 "c_cean_var_ptr_ptr",
199 Engine* mic_engines = 0;
200 uint32_t mic_engines_total = 0;
201 pthread_key_t mic_thread_key;
202 MicEnvVar mic_env_vars;
203 uint64_t cpu_frequency = 0;
205 // MIC_STACKSIZE
206 uint32_t mic_stack_size = 12 * 1024 * 1024;
208 // MIC_BUFFERSIZE
209 uint64_t mic_buffer_size = 0;
211 // Preallocated 4K page memory size for buffers on MIC
212 uint64_t mic_4k_buffer_size = 0;
214 // Preallocated 2M page memory size for buffers on MIC
215 uint64_t mic_2m_buffer_size = 0;
218 // LD_LIBRARY_PATH for KNC
219 char* knc_library_path = 0;
221 // LD_LIBRARY_PATH for KNL
222 char* knl_library_path = 0;
225 // MIC_PROXY_IO
226 bool mic_proxy_io = true;
228 // MIC_PROXY_FS_ROOT
229 char* mic_proxy_fs_root = 0;
231 // Threshold for creating buffers with large pages. Buffer is created
232 // with large pages hint if its size exceeds the threshold value.
233 // By default large pages are disabled right now (by setting default
234 // value for threshold to MAX) due to HSD 4114629.
235 uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
236 static const char *mic_use_2mb_buffers_envname =
237 "MIC_USE_2MB_BUFFERS";
239 static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
240 static const char *mic_use_async_buffer_write_envname =
241 "MIC_USE_ASYNC_BUFFER_WRITE";
243 static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
244 static const char *mic_use_async_buffer_read_envname =
245 "MIC_USE_ASYNC_BUFFER_READ";
247 // device initialization type
248 OffloadInitType __offload_init_type = c_init_on_offload_all;
249 static const char *offload_init_envname = "OFFLOAD_INIT";
251 // active wait
252 static bool __offload_active_wait = true;
253 static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
255 // wait even for asynchronous offload
256 // true for now still the performance issue with COI is not fixed
257 static bool __offload_always_wait = true;
258 static const char *offload_always_wait_envname = "OFFLOAD_ALWAYS_WAIT";
260 // OMP_DEFAULT_DEVICE
261 int __omp_device_num = 0;
262 static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
264 //OFFLOAD_PARALLEL_COPY
265 static bool __offload_parallel_copy = false;
266 static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";
268 //Use COI interface for noncontiguous transfer if it exists.
269 static bool __offload_use_coi_noncontiguous_transfer = false;
270 static const char *use_coi_noncontiguous_transfer_envname =
271 "MIC_USE_COI_MULTI_D";
273 // The list of pending target libraries
274 static bool __target_libs;
275 static TargetImageList __target_libs_list;
276 static mutex_t __target_libs_lock;
277 static mutex_t stack_alloc_lock;
278 static mutex_t lock_complete;
280 // Set of OffloadDescriptors of asynchronous offloads that are not destroyed
281 std::map<void *, bool> offload_descr_map;
283 // Target executable
284 TargetImage* __target_exe;
285 // is true if last loaded image is dll
286 bool __current_image_is_dll = false;
287 // is true if myo library is loaded when dll is loaded
288 bool __myo_init_in_so = false;
290 // Print readable offload flags
291 static void trace_offload_flags(
292 OffloadHostTimerData* timer_data,
293 OffloadFlags offload_flags
296 // Sized big enough for all flag names
297 char fbuffer[256];
298 bool first = true;
299 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
300 sprintf(fbuffer, " OffloadFlags=(");
301 if (offload_flags.bits.fortran_traceback) {
302 sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
303 first = false;
305 if (offload_flags.bits.omp_async) {
306 sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async");
307 first = false;
309 OFFLOAD_DEBUG_TRACE_1(1,
310 GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
311 "%s)\n", fbuffer);
315 // Print readable varDesc flags
316 static void trace_varDesc_flags(
317 OffloadHostTimerData* timer_data,
318 varDescFlags offload_flags
321 // Sized big enough for all flag names
322 char fbuffer[256];
323 bool first = true;
324 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
325 sprintf(fbuffer, " varDescFlags=(");
326 if (offload_flags.is_static) {
327 sprintf(fbuffer+strlen(fbuffer), "is_static");
328 first = false;
330 if (offload_flags.is_static_dstn) {
331 sprintf(fbuffer+strlen(fbuffer),
332 first ? "is_static_dstn" : ",is_static_dstn");
333 first = false;
335 if (offload_flags.has_length) {
336 sprintf(fbuffer+strlen(fbuffer),
337 first ? "has_length" : ",has_length");
338 first = false;
340 if (offload_flags.is_stack_buf) {
341 sprintf(fbuffer+strlen(fbuffer),
342 first ? "is_stack_buf" : ",is_stack_buf");
343 first = false;
345 if (offload_flags.targetptr) {
346 sprintf(fbuffer+strlen(fbuffer),
347 first ? "targetptr" : ",targetptr");
348 first = false;
350 if (offload_flags.preallocated) {
351 sprintf(fbuffer+strlen(fbuffer),
352 first ? "preallocated" : ",preallocated");
353 first = false;
355 if (offload_flags.is_pointer) {
356 sprintf(fbuffer+strlen(fbuffer),
357 first ? "is_pointer" : ",is_pointer");
358 first = false;
360 if (offload_flags.sink_addr) {
361 sprintf(fbuffer+strlen(fbuffer),
362 first ? "sink_addr" : ",sink_addr");
363 first = false;
365 if (offload_flags.alloc_disp) {
366 sprintf(fbuffer+strlen(fbuffer),
367 first ? "alloc_disp" : ",alloc_disp");
368 first = false;
370 if (offload_flags.is_noncont_src) {
371 sprintf(fbuffer+strlen(fbuffer),
372 first ? "is_noncont_src" : ",is_noncont_src");
373 first = false;
375 if (offload_flags.is_noncont_dst) {
376 sprintf(fbuffer+strlen(fbuffer),
377 first ? "is_noncont_dst" : ",is_noncont_dst");
378 first = false;
380 if (offload_flags.always_copy) {
381 sprintf(fbuffer+strlen(fbuffer),
382 first ? "always_copy" : ",always_copy");
383 first = false;
385 if (offload_flags.always_delete) {
386 sprintf(fbuffer+strlen(fbuffer),
387 first ? "always_delete" : ",always_delete");
388 first = false;
390 if (offload_flags.is_non_cont_struct) {
391 sprintf(fbuffer+strlen(fbuffer),
392 first ? "is_non_cont_struct" : ",is_non_cont_struct");
393 first = false;
395 if (offload_flags.pin) {
396 sprintf(fbuffer+strlen(fbuffer),
397 first ? "pin" : ",pin");
398 first = false;
400 if (offload_flags.is_device_ptr) {
401 sprintf(fbuffer+strlen(fbuffer),
402 first ? "is_device_ptr" : ",is_device_ptr");
403 first = false;
405 if (offload_flags.use_device_ptr) {
406 sprintf(fbuffer+strlen(fbuffer),
407 first ? "use_device_ptr" : ",use_device_ptr");
409 OFFLOAD_DEBUG_TRACE_1(1,
410 GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
411 "%s)\n", fbuffer);
415 static char * offload_get_src_base(void * ptr, uint8_t type)
417 char *base;
418 if (VAR_TYPE_IS_PTR(type)) {
419 base = *static_cast<char**>(ptr);
421 else if (VAR_TYPE_IS_SCALAR(type)) {
422 base = static_cast<char*>(ptr);
424 else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
425 ArrDesc *dvp;
426 if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
427 const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
428 dvp = (type == c_dv_data_slice) ?
429 reinterpret_cast<ArrDesc*>(ap->base) :
430 *reinterpret_cast<ArrDesc**>(ap->base);
432 else {
433 dvp = (type == c_dv_data) ?
434 static_cast<ArrDesc*>(ptr) :
435 *static_cast<ArrDesc**>(ptr);
437 base = reinterpret_cast<char*>(dvp->Base);
439 else {
440 base = NULL;
442 return base;
445 void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
447 // special case for the 'process died' error
448 if (res == COI_PROCESS_DIED) {
449 m_device.fini_process(true);
451 else {
452 switch (msg) {
453 case c_buf_create:
454 if (res == COI_OUT_OF_MEMORY) {
455 msg = c_buf_create_out_of_mem;
457 /* fallthru */
459 case c_buf_create_from_mem:
460 case c_buf_get_address:
461 case c_pipeline_create:
462 case c_pipeline_run_func:
463 LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
464 break;
466 case c_buf_read:
467 case c_buf_write:
468 case c_buf_copy:
469 case c_buf_map:
470 case c_buf_unmap:
471 case c_buf_destroy:
472 case c_buf_set_state:
473 LIBOFFLOAD_ERROR(msg, res);
474 break;
476 default:
477 break;
481 exit(1);
484 _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
486 switch (res) {
487 case COI_SUCCESS:
488 return OFFLOAD_SUCCESS;
490 case COI_PROCESS_DIED:
491 return OFFLOAD_PROCESS_DIED;
493 case COI_OUT_OF_MEMORY:
494 return OFFLOAD_OUT_OF_MEMORY;
496 default:
497 return OFFLOAD_ERROR;
501 // is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
502 // is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
503 // allocate memory at target; use its value as base in target table.
504 // is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
505 // base - is address at target of preallocated memory; use its value as
506 // base in target table.
508 bool OffloadDescriptor::alloc_ptr_data(
509 PtrData* &ptr_data,
510 void *base,
511 int64_t disp,
512 int64_t size,
513 int64_t alloc_disp,
514 int align,
515 bool is_targptr,
516 bool is_prealloc,
517 bool pin
520 // total length of base
521 int64_t length = size;
522 bool is_new;
523 COIBUFFER targptr_buf;
524 COIRESULT res;
525 uint32_t buffer_flags = 0;
526 char * base_disp = reinterpret_cast<char *>(base) + disp;
528 // create buffer with large pages if data length exceeds
529 // large page threshold
530 if (length >= __offload_use_2mb_buffers) {
531 buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
533 // Allocate memory at target for targetptr without preallocated as we need
534 // its address as base argument in call to m_device.insert_ptr_data
535 if (is_targptr && !is_prealloc) {
536 length = alloc_disp ? length : size + disp;
537 res = COI::BufferCreate(
538 length,
539 COI_BUFFER_OPENCL,
540 buffer_flags,
543 &m_device.get_process(),
544 &targptr_buf);
545 if (res != COI_SUCCESS) {
546 if (m_status != 0) {
547 m_status->result = translate_coi_error(res);
549 else if (m_is_mandatory) {
550 report_coi_error(c_buf_create, res);
552 return false;
555 res = COI::BufferGetSinkAddress(
556 targptr_buf, reinterpret_cast<uint64_t *>(&base));
557 if (res != COI_SUCCESS) {
558 if (m_status != 0) {
559 m_status->result = translate_coi_error(res);
561 else if (m_is_mandatory) {
562 report_coi_error(c_buf_get_address, res);
564 return false;
568 OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
569 alloc_disp ? base : base_disp,
570 alloc_disp ? length : size + disp);
572 // add new entry
574 ptr_data = is_targptr ?
575 m_device.find_targetptr_data(base_disp) :
576 m_device.find_ptr_data(base_disp);
577 // if ptr_data is found just need to check it for overlapping
578 if (ptr_data) {
579 is_new = false;
580 base = base_disp;
582 else {
583 // If association is not found we must create it.
584 length = alloc_disp ? length : size + disp;
585 ptr_data = is_targptr ?
586 m_device.insert_targetptr_data(base, length, is_new) :
587 m_device.insert_ptr_data(base, length, is_new);
589 if (is_new) {
591 OFFLOAD_TRACE(3, "Added new association\n");
593 if (length > 0) {
594 OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
596 // align should be a power of 2
597 if (!pin && !is_targptr &&
598 align > 0 && (align & (align - 1)) == 0) {
599 // offset within mic_buffer. Can do offset optimization
600 // only when source address alignment satisfies requested
601 // alignment on the target (cq172736).
602 if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
603 ptr_data->mic_offset =
604 reinterpret_cast<intptr_t>(base) & 4095;
608 // buffer size and flags
609 uint64_t buffer_size = length + ptr_data->mic_offset;
611 // For targetptr there is no CPU buffer
612 if (pin || !is_targptr) {
613 // create CPU buffer
614 OFFLOAD_DEBUG_TRACE_1(3,
615 GET_OFFLOAD_NUMBER(get_timer_data()),
616 c_offload_create_buf_host,
617 "Creating buffer from source memory %p, "
618 "length %lld\n", base, length);
620 // result is not checked because we can continue without cpu
621 // buffer. In this case we will use COIBufferRead/Write
622 // instead of COIBufferCopy.
624 COI::BufferCreateFromMemory(length,
625 COI_BUFFER_OPENCL,
627 base,
629 &m_device.get_process(),
630 &ptr_data->cpu_buf);
633 // create MIC buffer
634 if (is_prealloc) {
635 OFFLOAD_DEBUG_TRACE_1(3,
636 GET_OFFLOAD_NUMBER(get_timer_data()),
637 c_offload_create_buf_mic,
638 "Creating buffer from sink memory: "
639 "addr %p, size %lld, offset %d, flags 0x%x\n",
640 base, buffer_size, ptr_data->mic_offset,
641 buffer_flags);
642 res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
643 COI_BUFFER_NORMAL,
644 COI_SINK_MEMORY,
645 base,
647 &m_device.get_process(),
648 &ptr_data->mic_buf);
649 if (res != COI_SUCCESS) {
650 if (m_status != 0) {
651 m_status->result = translate_coi_error(res);
653 else if (m_is_mandatory) {
654 report_coi_error(c_buf_create, res);
656 ptr_data->alloc_ptr_data_lock.unlock();
657 return false;
660 else if (is_targptr) {
661 ptr_data->mic_buf = targptr_buf;
663 else if (!pin) {
664 OFFLOAD_DEBUG_TRACE_1(3,
665 GET_OFFLOAD_NUMBER(get_timer_data()),
666 c_offload_create_buf_mic,
667 "Creating buffer for sink: size %lld, offset %d, "
668 "flags =0x%x\n", buffer_size,
669 ptr_data->mic_offset, buffer_flags);
670 res = COI::BufferCreate(buffer_size,
671 COI_BUFFER_NORMAL,
672 buffer_flags,
675 &m_device.get_process(),
676 &ptr_data->mic_buf);
677 if (res != COI_SUCCESS) {
678 if (m_status != 0) {
679 m_status->result = translate_coi_error(res);
681 else if (m_is_mandatory) {
682 report_coi_error(c_buf_create, res);
684 ptr_data->alloc_ptr_data_lock.unlock();
685 return false;
689 if (!pin) {
690 // make buffer valid on the device.
691 res = COI::BufferSetState(ptr_data->mic_buf,
692 m_device.get_process(),
693 COI_BUFFER_VALID,
694 COI_BUFFER_NO_MOVE,
695 0, 0, 0);
696 if (res != COI_SUCCESS) {
697 if (m_status != 0) {
698 m_status->result = translate_coi_error(res);
700 else if (m_is_mandatory) {
701 report_coi_error(c_buf_set_state, res);
703 ptr_data->alloc_ptr_data_lock.unlock();
704 return false;
707 res = COI::BufferSetState(ptr_data->mic_buf,
708 COI_PROCESS_SOURCE,
709 COI_BUFFER_INVALID,
710 COI_BUFFER_NO_MOVE,
711 0, 0, 0);
712 if (res != COI_SUCCESS) {
713 if (m_status != 0) {
714 m_status->result = translate_coi_error(res);
716 else if (m_is_mandatory) {
717 report_coi_error(c_buf_set_state, res);
719 ptr_data->alloc_ptr_data_lock.unlock();
720 return false;
724 ptr_data->alloc_disp = alloc_disp;
725 ptr_data->alloc_ptr_data_lock.unlock();
727 else {
728 mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
730 OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
731 "is_static %d\n",
732 ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
733 ptr_data->is_static);
735 // This is not a new entry. Make sure that provided address range fits
736 // into existing one.
737 MemRange addr_range(base, length);
738 if (!ptr_data->cpu_addr.contains(addr_range)) {
739 LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
740 const_cast<void *>(ptr_data->cpu_addr.start()),
741 ptr_data->cpu_addr.length());
742 exit(1);
745 // if the entry is associated with static data it may not have buffers
746 // created because they are created on demand.
747 if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
748 return false;
752 return true;
755 bool OffloadDescriptor::find_ptr_data(
756 PtrData* &ptr_data,
757 void *in_base,
758 int64_t disp,
759 int64_t size,
760 bool is_targetptr,
761 bool report_error
764 // total length of base
765 int64_t length = size;
766 char *base = reinterpret_cast<char *>(in_base) + disp;
768 OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
769 "length %lld\n", base, length);
771 // find existing association in pointer table
772 ptr_data = is_targetptr ?
773 m_device.find_targetptr_data(base) :
774 m_device.find_ptr_data(base);
775 if (ptr_data == 0) {
776 if (report_error) {
777 LIBOFFLOAD_ERROR(c_no_ptr_data, base);
778 exit(1);
780 OFFLOAD_TRACE(3, "Association does not exist\n");
781 return true;
784 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
785 ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
786 ptr_data->is_static);
788 // make sure that provided address range fits into existing one
789 MemRange addr_range(base, length);
790 if (!ptr_data->cpu_addr.contains(addr_range)) {
791 if (report_error) {
792 LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
793 const_cast<void *>(ptr_data->cpu_addr.start()),
794 ptr_data->cpu_addr.length());
795 exit(1);
797 OFFLOAD_TRACE(3, "Existing association partially overlaps with "
798 "data address range\n");
799 ptr_data = 0;
800 return true;
803 // if the entry is associated with static data it may not have buffers
804 // created because they are created on demand.
805 if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
806 return false;
809 return true;
812 void OffloadDescriptor::find_device_ptr(
813 int64_t* &device_ptr,
814 void *host_ptr
817 PtrData* ptr_data;
818 char *base = reinterpret_cast<char *>(host_ptr);
820 OFFLOAD_TRACE(3, "Looking for association for data: addr %p\n", base);
822 // find existing association in pointer table
823 ptr_data = m_device.find_ptr_data(base);
825 // MIC address should have been assigned.
826 // For now assume does not exist and get the addr
827 // if ((ptr_data == 0) || ptr_data->mic_addr) {
829 if (ptr_data == 0) {
830 OFFLOAD_TRACE(3, "Association does not exist\n");
831 LIBOFFLOAD_ERROR(c_no_ptr_data, base);
832 exit(1);
834 if (!ptr_data->mic_addr) {
835 COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
836 &ptr_data->mic_addr);
837 if (res != COI_SUCCESS) {
838 if (m_status != 0)
839 m_status->result = translate_coi_error(res);
840 report_coi_error(c_buf_get_address, res);
844 device_ptr = (int64_t *) ptr_data->mic_addr;
846 OFFLOAD_TRACE(3, "Found association: host_ptr %p, device_ptr = %p\n",
847 ptr_data->cpu_addr.start(), device_ptr);
850 bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
852 OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
854 if (ptr_data->cpu_buf == 0) {
855 OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
856 ptr_data->cpu_addr.start());
858 COIRESULT res = COI::BufferCreateFromMemory(
859 ptr_data->cpu_addr.length(),
860 COI_BUFFER_OPENCL,
862 const_cast<void*>(ptr_data->cpu_addr.start()),
863 1, &m_device.get_process(),
864 &ptr_data->cpu_buf);
866 if (res != COI_SUCCESS) {
867 if (m_status != 0) {
868 m_status->result = translate_coi_error(res);
869 return false;
871 report_coi_error(c_buf_create_from_mem, res);
875 if (ptr_data->mic_buf == 0) {
876 OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
877 ptr_data->mic_addr);
879 COIRESULT res = COI::BufferCreateFromMemory(
880 ptr_data->cpu_addr.length(),
881 COI_BUFFER_NORMAL,
882 COI_SINK_MEMORY,
883 reinterpret_cast<void*>(ptr_data->mic_addr),
884 1, &m_device.get_process(),
885 &ptr_data->mic_buf);
887 if (res != COI_SUCCESS) {
888 if (m_status != 0) {
889 m_status->result = translate_coi_error(res);
890 return false;
892 report_coi_error(c_buf_create_from_mem, res);
896 return true;
899 bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
901 if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
902 COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
903 &ptr_data->mic_addr);
904 if (res != COI_SUCCESS) {
905 if (m_status != 0) {
906 m_status->result = translate_coi_error(res);
908 else if (m_is_mandatory) {
909 report_coi_error(c_buf_get_address, res);
911 return false;
914 return true;
917 bool OffloadDescriptor::nullify_target_stack(
918 COIBUFFER targ_buf,
919 uint64_t size
922 char * ptr = (char*)malloc(size);
923 if (ptr == NULL)
924 LIBOFFLOAD_ERROR(c_malloc);
925 COIRESULT res;
927 memset(ptr, 0, size);
928 res = COI::BufferWrite(
929 targ_buf,
931 ptr,
932 size,
933 COI_COPY_UNSPECIFIED,
934 0, 0, 0);
935 free(ptr);
936 if (res != COI_SUCCESS) {
937 if (m_status != 0) {
938 m_status->result = translate_coi_error(res);
939 return false;
941 report_coi_error(c_buf_write, res);
943 return true;
946 static void print_persistList_item(
947 const char *msg,
948 PersistData *cur_el
951 OFFLOAD_TRACE(4, "%s\n", msg);
952 OFFLOAD_TRACE(4, " stack_cpu_addr = %p\n", cur_el->stack_cpu_addr);
953 OFFLOAD_TRACE(4, " routine_id = %d\n", cur_el->routine_id);
954 OFFLOAD_TRACE(4, " thread_id = %lld\n", cur_el->thread_id);
955 OFFLOAD_TRACE(4, " stack_ptr_data = %p\n", cur_el->stack_ptr_data);
956 OFFLOAD_TRACE(4, " MIC buffer = %p\n", cur_el->stack_ptr_data->mic_buf);
957 OFFLOAD_TRACE(4, " MIC addr = %p\n", cur_el->stack_ptr_data->mic_addr);
958 OFFLOAD_TRACE(4, " cpu_stack_addr = %p\n", cur_el->cpu_stack_addr);
961 static mutex_t stack_memory_manager_lock;
963 bool OffloadDescriptor::offload_stack_memory_manager(
964 const void * stack_begin,
965 int routine_id,
966 int buf_size,
967 int align,
968 bool thread_specific_function_locals,
969 bool *is_new)
971 //mutex_locker_t locker(stack_alloc_lock);
972 stack_memory_manager_lock.lock();
974 PersistData * new_el;
975 PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
976 PersistDataList::iterator it_end;
977 int erase = 0;
978 uint64_t cur_thread_id = m_device.get_thread_id();
980 OFFLOAD_TRACE(3, "offload_stack_memory_manager("
981 "stack_begin=%p, routine_id=%d, buf_size=%d,"
982 "align=%d, thread_specific_function_locals=%d, bool=%p)\n",
983 stack_begin, routine_id, buf_size,
984 align, thread_specific_function_locals, is_new);
985 OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
986 *is_new = false;
988 for (PersistDataList::iterator it = m_device.m_persist_list.begin();
989 it != m_device.m_persist_list.end(); it++) {
990 PersistData cur_el = *it;
992 print_persistList_item("Current element in persist list:", &cur_el);
993 if (stack_begin > it->stack_cpu_addr) {
994 if (cur_thread_id == cur_el.thread_id) {
995 // this stack data must be destroyed
996 m_destroy_stack.push_front(cur_el.stack_ptr_data);
997 it_end = it;
998 erase++;
999 OFFLOAD_TRACE(3, "Current element below TOS: so delete\n");
1002 else if (stack_begin == it->stack_cpu_addr) {
1003 if (routine_id != it-> routine_id) {
1004 // this stack data must be destroyed
1005 // because the current function is a dynamic sibling
1006 m_destroy_stack.push_front(cur_el.stack_ptr_data);
1007 it_end = it;
1008 erase++;
1009 OFFLOAD_TRACE(3, "Current element is sibling: so delete\n");
1010 break;
1012 else if (!thread_specific_function_locals ||
1013 cur_thread_id == cur_el.thread_id) {
1014 // stack data is reused
1015 m_stack_ptr_data = it->stack_ptr_data;
1016 if (erase > 0) {
1017 // all obsolete stack sections must be erased from the list
1018 m_device.m_persist_list.erase(it_begin, ++it_end);
1019 m_in_datalen +=
1020 erase * sizeof(new_el->stack_ptr_data->mic_addr);
1022 OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
1023 m_stack_ptr_data->mic_addr);
1024 stack_memory_manager_lock.unlock();
1025 return true;
1028 else if (stack_begin < it->stack_cpu_addr &&
1029 cur_thread_id == cur_el.thread_id) {
1030 OFFLOAD_TRACE(3, "Current element is above TOS\n");
1031 break;
1035 if (erase > 0) {
1036 // all obsolete stack sections must be erased from the list
1037 m_device.m_persist_list.erase(it_begin, ++it_end);
1038 m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
1040 // new stack table is created
1041 new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
1042 // create MIC buffer
1043 COIRESULT res;
1044 uint32_t buffer_flags = 0;
1046 // create buffer with large pages if data length exceeds
1047 // large page threshold
1048 if (buf_size >= __offload_use_2mb_buffers) {
1049 buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
1051 res = COI::BufferCreate(buf_size,
1052 COI_BUFFER_NORMAL,
1053 buffer_flags,
1056 &m_device.get_process(),
1057 &new_el->stack_ptr_data->mic_buf);
1058 if (res != COI_SUCCESS) {
1059 if (m_status != 0) {
1060 m_status->result = translate_coi_error(res);
1062 else if (m_is_mandatory) {
1063 report_coi_error(c_buf_create, res);
1065 stack_memory_manager_lock.unlock();
1066 return false;
1068 // make buffer valid on the device.
1069 res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
1070 m_device.get_process(),
1071 COI_BUFFER_VALID,
1072 COI_BUFFER_NO_MOVE,
1073 0, 0, 0);
1074 if (res != COI_SUCCESS) {
1075 if (m_status != 0) {
1076 m_status->result = translate_coi_error(res);
1078 else if (m_is_mandatory) {
1079 report_coi_error(c_buf_set_state, res);
1081 stack_memory_manager_lock.unlock();
1082 return false;
1084 res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
1085 COI_PROCESS_SOURCE,
1086 COI_BUFFER_INVALID,
1087 COI_BUFFER_NO_MOVE,
1088 0, 0, 0);
1089 if (res != COI_SUCCESS) {
1090 if (m_status != 0) {
1091 m_status->result = translate_coi_error(res);
1093 else if (m_is_mandatory) {
1094 report_coi_error(c_buf_set_state, res);
1096 stack_memory_manager_lock.unlock();
1097 return false;
1099 // persistence algorithm requires target stack initialy to be nullified
1100 if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
1101 stack_memory_manager_lock.unlock();
1102 return false;
1105 m_stack_ptr_data = new_el->stack_ptr_data;
1106 init_mic_address(m_stack_ptr_data);
1107 OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
1108 m_stack_ptr_data->mic_addr);
1109 m_device.m_persist_list.push_front(*new_el);
1110 init_mic_address(new_el->stack_ptr_data);
1111 *is_new = true;
1113 stack_memory_manager_lock.unlock();
1114 return true;
1117 // Search through persistent stack buffers
1118 // for the top-of-stack buffer for this thread
1119 char* OffloadDescriptor::get_this_threads_cpu_stack_addr(
1120 const void * stack_begin,
1121 int routine_id,
1122 bool thread_specific_function_locals
1125 uint64_t cur_thread_id = m_device.get_thread_id();
1126 char* matched = 0;
1128 OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr("
1129 "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
1130 stack_begin, routine_id, thread_specific_function_locals);
1131 OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
1133 stack_memory_manager_lock.lock();
1134 for (PersistDataList::iterator it = m_device.m_persist_list.begin();
1135 it != m_device.m_persist_list.end(); it++)
1137 PersistData cur_el = *it;
1138 print_persistList_item("Current element in persist list:", &cur_el);
1139 if (stack_begin == cur_el.stack_cpu_addr)
1141 // For OpenMP shared function locals matching is done without
1142 // regard to thread id. But, we return the last match, which
1143 // corresponds to the outer stack.
1144 if (!thread_specific_function_locals)
1146 matched = cur_el.cpu_stack_addr;
1147 continue;
1149 // For non-OpenMP shared function-local variables
1150 // the thread-id must match
1151 if (cur_thread_id == cur_el.thread_id)
1153 matched = cur_el.cpu_stack_addr;
1154 break;
1158 stack_memory_manager_lock.unlock();
1159 if (matched != 0)
1161 OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr() => %p\n", matched);
1162 return matched;
1165 OFFLOAD_TRACE(1,
1166 "Could not find persistent data; expect Read/Write failure\n");
1167 return 0;
1170 // Search through persistent stack buffers
1171 // for the top-of-stack MIC buffer for this thread
1172 PtrData* OffloadDescriptor::get_this_threads_mic_stack_addr(
1173 const void * stack_begin,
1174 int routine_id,
1175 bool thread_specific_function_locals
1178 uint64_t cur_thread_id = m_device.get_thread_id();
1179 PtrData* matched = 0;
1181 OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr("
1182 "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
1183 stack_begin, routine_id, thread_specific_function_locals);
1184 OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
1186 stack_memory_manager_lock.lock();
1187 for (PersistDataList::iterator it = m_device.m_persist_list.begin();
1188 it != m_device.m_persist_list.end(); it++)
1190 PersistData cur_el = *it;
1191 print_persistList_item("Current element in persist list:", &cur_el);
1192 if (stack_begin == cur_el.stack_cpu_addr)
1194 // For OpenMP shared function locals matching is done without
1195 // regard to thread id. But, we return the last match, which
1196 // corresponds to the outer stack.
1197 if (!thread_specific_function_locals)
1199 matched = cur_el.stack_ptr_data;
1200 continue;
1202 // For non-OpenMP shared function-local variables
1203 // the thread-id must match
1204 if (cur_thread_id == cur_el.thread_id)
1206 matched = cur_el.stack_ptr_data;
1207 break;
1211 stack_memory_manager_lock.unlock();
1212 if (matched != 0)
1214 OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr() => %p\n", matched);
1215 return matched;
1218 OFFLOAD_TRACE(1,
1219 "Could not find persistent data; expect Read/Write failure\n");
1220 return 0;
1223 void OffloadDescriptor::setup_use_device_ptr(int i)
1225 PtrData *ptr_data;
1226 ArrDesc *dvp;
1227 void *base;
1228 if (m_vars_extra[i].type_src == c_dv_ptr) {
1229 dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1230 base = reinterpret_cast<void*>(dvp->Base);
1232 else {
1233 base = *static_cast<void**>(m_vars[i].ptr);
1235 if (m_vars[i].direction.in) {
1236 int64_t *device_ptr;
1237 bool is_new = true;
1239 find_device_ptr(device_ptr, base);
1241 // Create a entry in targetptr table using device_ptr
1242 // as lookup for later recover the host pointer
1243 ptr_data = m_device.insert_targetptr_data(device_ptr,
1244 0, is_new);
1246 // Actually the base is a host pointer and cpu_addr is
1247 // device pointer. This is special case where the 2
1248 // address usage is reversed to enable using existing
1249 // PtrData structure instead of adding new fields.
1250 ptr_data->mic_addr = (uint64_t) base;
1252 ptr_data->alloc_ptr_data_lock.unlock();
1254 // Replace host pointer with device pointer
1255 if (m_vars_extra[i].type_src == c_dv_ptr) {
1256 dvp->Base = reinterpret_cast<dv_size>(device_ptr);
1258 else {
1259 *static_cast<void**>(m_vars[i].ptr) = device_ptr;
1262 else if (m_vars[i].direction.out) {
1263 // For use_device_ptr and out find associated host ptr
1264 // and assign to host ptr
1265 ptr_data = m_device.find_targetptr_data(base);
1266 if (!ptr_data) {
1267 LIBOFFLOAD_ERROR(c_no_ptr_data, base);
1268 exit(1);
1270 if (m_vars_extra[i].type_src == c_dv_ptr) {
1271 dvp->Base = ptr_data->mic_addr;
1273 else {
1274 *static_cast<void**>(m_vars[i].ptr) =
1275 reinterpret_cast<void*>(ptr_data->mic_addr);
1277 m_device.remove_targetptr_data(
1278 ptr_data->cpu_addr.start());
1282 bool OffloadDescriptor::setup_descriptors(
1283 VarDesc *vars,
1284 VarDesc2 *vars2,
1285 int vars_total,
1286 int entry_id,
1287 const void *stack_addr
1290 COIRESULT res;
1291 // To enable caching the CPU stack base address for stack variables
1292 char* this_threads_cpu_stack_addr = 0;
1293 // To properly deal with non-OpenMP threading and function-local variables
1294 // For OpenMP threading we support all function-locals in shared mode only
1295 bool thread_specific_function_locals = !omp_in_parallel();
1297 OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
1298 // make a copy of variable descriptors
1299 m_vars_total = vars_total;
1300 if (vars_total > 0) {
1301 m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
1302 if (m_vars == NULL)
1303 LIBOFFLOAD_ERROR(c_malloc);
1304 memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
1305 m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
1306 if (m_vars_extra == NULL)
1307 LIBOFFLOAD_ERROR(c_malloc);
1310 // dependencies
1311 m_in_deps_allocated = m_vars_total + 1;
1312 m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
1313 if (m_in_deps == NULL)
1314 LIBOFFLOAD_ERROR(c_malloc);
1315 if (m_vars_total > 0) {
1316 m_out_deps_allocated = m_vars_total;
1317 m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
1318 if (m_out_deps == NULL)
1319 LIBOFFLOAD_ERROR(c_malloc);
1321 // copyin/copyout data length
1322 m_in_datalen = 0;
1323 m_out_datalen = 0;
1325 // First pass over variable descriptors
1326 // - Calculate size of the input and output non-pointer data
1327 // - Allocate buffers for input and output pointers
1328 for (int i = 0; i < m_vars_total; i++) {
1329 void* alloc_base = NULL;
1330 int64_t alloc_disp = 0;
1331 int64_t alloc_size = 0;
1332 bool src_is_for_mic = (m_vars[i].direction.out ||
1333 m_vars[i].into == NULL);
1334 bool src_is_for_host = (m_vars[i].direction.in ||
1335 m_vars[i].into == NULL);
1336 const char *var_sname = "";
1337 if (vars2 != NULL && i < vars_total) {
1338 if (vars2[i].sname != NULL) {
1339 var_sname = vars2[i].sname;
1343 // instead of m_vars[i].type.src we will use m_vars_extra[i].type_src
1344 if (m_vars[i].type.src == c_extended_type) {
1345 VarDescExtendedType *etype =
1346 reinterpret_cast<VarDescExtendedType*>(m_vars[i].ptr);
1347 m_vars_extra[i].type_src = etype->extended_type;
1348 m_vars[i].ptr = etype->ptr;
1350 else {
1351 m_vars_extra[i].type_src = m_vars[i].type.src;
1353 // instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst
1354 if (m_vars[i].type.dst == c_extended_type) {
1355 VarDescExtendedType *etype =
1356 reinterpret_cast<VarDescExtendedType*>(m_vars[i].into);
1357 if (etype) {
1358 m_vars_extra[i].type_dst = etype->extended_type;
1359 m_vars[i].into = etype->ptr;
1361 else {
1362 m_vars_extra[i].type_dst = m_vars_extra[i].type_src;
1365 else {
1366 m_vars_extra[i].type_dst = m_vars[i].type.dst;
1368 OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
1369 i, var_sname,
1370 vardesc_direction_as_string[m_vars[i].direction.bits],
1371 vardesc_type_as_string[m_vars_extra[i].type_src]);
1372 if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
1373 OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname,
1374 vardesc_type_as_string[m_vars_extra[i].type_dst]);
1376 OFFLOAD_TRACE(2,
1377 " type_src=%d, type_dstn=%d, direction=%d, "
1378 "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
1379 "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
1380 m_vars_extra[i].type_src,
1381 m_vars_extra[i].type_dst,
1382 m_vars[i].direction.bits,
1383 m_vars[i].alloc_if,
1384 m_vars[i].free_if,
1385 m_vars[i].align,
1386 m_vars[i].mic_offset,
1387 m_vars[i].flags.bits,
1388 m_vars[i].offset,
1389 m_vars[i].size,
1390 m_vars[i].count,
1391 m_vars[i].ptr,
1392 m_vars[i].into);
1393 // If any varDesc flags bits set, show them
1394 if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
1395 trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
1398 // preallocated implies targetptr
1399 if (m_vars[i].flags.preallocated) {
1400 // targetptr preallocated alloc_if(1) may not be used with
1401 // an in clause
1402 if (m_vars[i].direction.in && m_vars[i].alloc_if) {
1403 LIBOFFLOAD_ERROR(c_in_with_preallocated);
1404 exit(1);
1406 m_vars[i].flags.targetptr = 1;
1408 if (m_vars[i].alloc != NULL) {
1409 // array descriptor
1410 const Arr_Desc *ap =
1411 static_cast<const Arr_Desc*>(m_vars[i].alloc);
1413 // debug dump
1414 ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1);
1416 __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
1418 alloc_base = reinterpret_cast<void*>(ap->base);
1421 m_vars_extra[i].alloc = m_vars[i].alloc;
1422 m_vars_extra[i].auto_data = 0;
1423 m_vars_extra[i].cpu_disp = 0;
1424 m_vars_extra[i].cpu_offset = 0;
1425 m_vars_extra[i].src_data = 0;
1426 m_vars_extra[i].read_rng_src = 0;
1427 m_vars_extra[i].read_rng_dst = 0;
1428 m_vars_extra[i].omp_last_event_type = c_last_not;
1429 // flag is_arr_ptr_el is 1 only for var_descs generated
1430 // for c_data_ptr_array type
1431 if (i < vars_total) {
1432 m_vars_extra[i].is_arr_ptr_el = 0;
1434 if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
1435 TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
1436 m_vars[i].flags.is_pointer) {
1437 m_vars_extra[i].pointer_offset = m_vars[i].offset;
1438 m_vars[i].offset = 0;
1439 m_in_datalen += sizeof(m_vars[i].offset);
1442 switch (m_vars_extra[i].type_src) {
1443 case c_data_ptr_array:
1445 const Arr_Desc *ap;
1446 const VarDesc3 *vd3 =
1447 static_cast<const VarDesc3*>(m_vars[i].ptr);
1448 int flags = vd3->array_fields;
1449 OFFLOAD_TRACE(2,
1450 " pointer array flags = %04x\n", flags);
1451 OFFLOAD_TRACE(2,
1452 " pointer array type is %s\n",
1453 vardesc_type_as_string[flags & 0x3f]);
1454 ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
1455 ARRAY_DESC_DUMP(" ", "ptr array", ap,
1456 m_vars[i].flags.is_pointer, 1);
1457 if (m_vars[i].into) {
1458 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
1459 ARRAY_DESC_DUMP(
1460 " ", "into array", ap, 0, 1);
1462 if ((flags & (1<<flag_align_is_array)) != 0) {
1463 ap = static_cast<const Arr_Desc*>(vd3->align_array);
1464 ARRAY_DESC_DUMP(
1465 " ", "align array", ap, 0, 1);
1467 if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
1468 ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
1469 ARRAY_DESC_DUMP(
1470 " ", "alloc_if array", ap, 0, 1);
1472 if ((flags & (1<<flag_free_if_is_array)) != 0) {
1473 ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
1474 ARRAY_DESC_DUMP(
1475 " ", "free_if array", ap, 0, 1);
1477 if ((flags & (1<<flag_extent_start_is_array)) != 0) {
1478 ap = static_cast<const Arr_Desc*>(vd3->extent_start);
1479 ARRAY_DESC_DUMP(
1480 " ", "extent_start array", ap, 0, 1);
1481 } else if ((flags &
1482 (1<<flag_extent_start_is_scalar)) != 0) {
1483 OFFLOAD_TRACE(2,
1484 " extent_start scalar = %d\n",
1485 (int64_t)vd3->extent_start);
1487 if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
1488 ap = static_cast<const Arr_Desc*>
1489 (vd3->extent_elements);
1490 ARRAY_DESC_DUMP(" ",
1491 "extent_elements array", ap, 0, 1);
1492 } else if ((flags &
1493 (1<<flag_extent_elements_is_scalar)) != 0) {
1494 OFFLOAD_TRACE(2,
1495 " extent_elements scalar = %d\n",
1496 (int64_t)vd3->extent_elements);
1498 if ((flags & (1<<flag_into_start_is_array)) != 0) {
1499 ap = static_cast<const Arr_Desc*>(vd3->into_start);
1500 ARRAY_DESC_DUMP(
1501 " ", "into_start array", ap, 0, 1);
1502 } else if ((flags &
1503 (1<<flag_into_start_is_scalar)) != 0) {
1504 OFFLOAD_TRACE(2,
1505 " into_start scalar = %d\n",
1506 (int64_t)vd3->into_start);
1508 if ((flags & (1<<flag_into_elements_is_array)) != 0) {
1509 ap = static_cast<const Arr_Desc*>(vd3->into_elements);
1510 ARRAY_DESC_DUMP(
1511 " ", "into_elements array", ap, 0, 1);
1512 } else if ((flags &
1513 (1<<flag_into_elements_is_scalar)) != 0) {
1514 OFFLOAD_TRACE(2,
1515 " into_elements scalar = %d\n",
1516 (int64_t)vd3->into_elements);
1518 if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
1519 ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
1520 ARRAY_DESC_DUMP(
1521 " ", "alloc_start array", ap, 0, 1);
1522 } else if ((flags &
1523 (1<<flag_alloc_start_is_scalar)) != 0) {
1524 OFFLOAD_TRACE(2,
1525 " alloc_start scalar = %d\n",
1526 (int64_t)vd3->alloc_start);
1528 if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
1529 ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
1530 ARRAY_DESC_DUMP(" ",
1531 "alloc_elements array", ap, 0, 1);
1532 } else if ((flags &
1533 (1<<flag_alloc_elements_is_scalar)) != 0) {
1534 OFFLOAD_TRACE(2,
1535 " alloc_elements scalar = %d\n",
1536 (int64_t)vd3->alloc_elements);
1539 if (!gen_var_descs_for_pointer_array(i)) {
1540 return false;
1542 break;
1544 case c_data:
1545 case c_void_ptr:
1546 case c_void_ptr_ptr:
1547 case c_cean_var:
1548 // In all uses later
1549 // VarDesc.size will have the length of the data to be
1550 // transferred
1551 // VarDesc.disp will have an offset from base
1553 if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
1554 NonContigDesc *desc =
1555 static_cast<NonContigDesc*>(m_vars[i].ptr);
1556 noncont_struct_dump(" ", "DATA", desc);
1557 m_vars_extra[i].noncont_desc = desc;
1558 m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
1559 m_vars[i].size = get_noncont_struct_size(desc);
1560 m_vars[i].disp = 0;
1562 else if (m_vars_extra[i].type_src == c_cean_var) {
1563 // array descriptor
1564 const Arr_Desc *ap =
1565 static_cast<const Arr_Desc*>(m_vars[i].ptr);
1567 // debug dump
1568 ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
1570 // offset and length are derived from the array descriptor
1571 __arr_data_offset_and_length(ap, m_vars[i].disp,
1572 m_vars[i].size);
1573 if (!is_arr_desc_contiguous(ap)) {
1574 m_vars[i].flags.is_noncont_src = 1;
1575 m_vars_extra[i].read_rng_src =
1576 init_read_ranges_arr_desc(ap);
1578 // all necessary information about length and offset is
1579 // transferred in var descriptor. There is no need to send
1580 // array descriptor to the target side.
1581 m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1583 else {
1584 m_vars[i].size *= m_vars[i].count;
1585 m_vars[i].disp = 0;
1588 if (m_vars[i].direction.bits) {
1589 // make sure that transfer size > 0
1590 if (m_vars[i].size <= 0) {
1591 LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
1592 exit(1);
1595 if (m_vars[i].flags.is_static) {
1596 PtrData *ptr_data;
1597 // find data associated with variable
1598 if (!find_ptr_data(ptr_data,
1599 m_vars[i].ptr,
1600 m_vars[i].disp,
1601 m_vars[i].size,
1602 false, false)) {
1603 return false;
1606 if (ptr_data != 0) {
1607 // offset to base from the beginning of the buffer
1608 // memory
1609 m_vars[i].offset =
1610 (char*) m_vars[i].ptr -
1611 (char*) ptr_data->cpu_addr.start();
1613 else {
1614 m_vars[i].flags.is_static = false;
1615 if (m_vars[i].into == NULL) {
1616 m_vars[i].flags.is_static_dstn = false;
1619 m_vars_extra[i].src_data = ptr_data;
1622 if (m_vars[i].direction.in &&
1623 !m_vars[i].flags.is_static &&
1624 !m_vars[i].flags.is_stack_buf) {
1625 m_in_datalen += m_vars[i].size;
1627 // for non-static target destination defined as CEAN
1628 // expression we pass to target its size and dist
1629 if (m_vars[i].into == NULL &&
1630 m_vars_extra[i].type_src == c_cean_var) {
1631 m_in_datalen += 2 * sizeof(uint64_t);
1633 m_need_runfunction = true;
1635 if (m_vars[i].direction.out &&
1636 !m_vars[i].flags.is_static &&
1637 !m_vars[i].flags.is_stack_buf) {
1638 m_out_datalen += m_vars[i].size;
1639 m_need_runfunction = true;
1642 if (m_is_openmp && src_is_for_host &&
1643 !m_vars[i].flags.is_device_ptr) {
1644 if (m_vars[i].flags.is_static) {
1645 PtrData *ptr_data = m_vars_extra[i].src_data;
1646 // Static data is transferred either by omp target
1647 // update construct which passes zeros for
1648 // alloc_if and free_if or by always modifier.
1649 // Implicit openmp reference is transfered also
1650 // if its reference count is equal to 1
1651 if (ptr_data &&
1652 IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
1653 if (m_vars[i].alloc_if) {
1654 ptr_data->add_reference();
1657 if (!m_vars[i].flags.always_copy &&
1658 (m_vars[i].alloc_if || m_vars[i].free_if) &&
1659 ptr_data->get_reference() != 1) {
1660 m_vars[i].direction.bits = c_parameter_nocopy;
1663 else if (
1664 !m_vars[i].flags.always_copy &&
1665 (m_vars[i].alloc_if || m_vars[i].free_if)) {
1666 m_vars[i].direction.bits = c_parameter_nocopy;
1669 else {
1670 AutoData *auto_data;
1671 if (m_vars[i].alloc_if) {
1672 auto_data = m_device.insert_auto_data(
1673 m_vars[i].ptr, m_vars[i].size);
1674 auto_data->add_reference();
1676 else {
1677 // TODO: what should be done if var is not in
1678 // the table?
1679 auto_data = m_device.find_auto_data(
1680 m_vars[i].ptr);
1683 // For automatic variables data is transferred:
1684 // - if always modifier is used OR
1685 // - if alloc_if == 0 && free_if == 0 OR
1686 // - if reference count is 1
1687 if (!m_vars[i].flags.always_copy &&
1688 (m_vars[i].alloc_if || m_vars[i].free_if) &&
1689 auto_data != 0 &&
1690 auto_data->get_reference() != 1) {
1691 m_vars[i].direction.bits = c_parameter_nocopy;
1694 // save data for later use
1695 m_vars_extra[i].auto_data = auto_data;
1698 break;
1700 case c_dv:
1701 if (m_vars[i].flags.use_device_ptr) {
1702 setup_use_device_ptr(i);
1703 break;
1705 else if (m_vars[i].direction.bits ||
1706 m_vars[i].alloc_if ||
1707 m_vars[i].free_if) {
1708 ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
1710 // debug dump
1711 __dv_desc_dump("IN/OUT", dvp);
1713 // send dope vector contents excluding base
1714 m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1715 m_need_runfunction = true;
1717 break;
1719 case c_string_ptr:
1720 case c_string_ptr_ptr:
1721 if ((m_vars[i].direction.bits ||
1722 m_vars[i].alloc_if ||
1723 m_vars[i].free_if) &&
1724 m_vars[i].size == 0) {
1725 m_vars[i].size = 1;
1726 m_vars[i].count =
1727 strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
1729 /* fallthru */
1731 case c_data_ptr:
1732 case c_data_ptr_ptr:
1733 if (m_vars[i].flags.is_stack_buf &&
1734 !m_vars[i].direction.bits &&
1735 m_vars[i].alloc_if) {
1736 // this var_desc is for stack buffer
1737 bool is_new;
1739 if (!offload_stack_memory_manager(
1740 stack_addr, entry_id,
1741 m_vars[i].count, m_vars[i].align,
1742 thread_specific_function_locals, &is_new)) {
1743 return false;
1745 if (is_new) {
1746 m_compute_buffers.push_back(
1747 m_stack_ptr_data->mic_buf);
1748 m_device.m_persist_list.front().cpu_stack_addr =
1749 static_cast<char*>(m_vars[i].ptr);
1750 PersistData *new_el = &m_device.m_persist_list.front();
1751 print_persistList_item(
1752 "New element in persist list:",
1753 new_el);
1755 else {
1756 m_vars[i].flags.sink_addr = 1;
1757 m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
1758 if (thread_specific_function_locals) {
1759 m_stack_ptr_data = get_this_threads_mic_stack_addr(
1760 stack_addr, entry_id,
1761 thread_specific_function_locals);
1764 m_vars[i].size = m_destroy_stack.size();
1765 m_vars_extra[i].src_data = m_stack_ptr_data;
1767 // need to add or remove references for stack buffer at target
1768 if (is_new || m_destroy_stack.size()) {
1769 m_need_runfunction = true;
1772 break;
1774 /* fallthru */
1776 case c_cean_var_ptr:
1777 case c_cean_var_ptr_ptr:
1778 case c_dv_ptr:
1779 if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
1780 NonContigDesc *desc =
1781 static_cast<NonContigDesc*>(m_vars[i].ptr);
1782 noncont_struct_dump(" ", "PTR", desc);
1783 m_vars_extra[i].noncont_desc = desc;
1784 m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
1785 m_vars[i].disp = 0;
1787 else if (m_vars_extra[i].type_src == c_cean_var_ptr ||
1788 m_vars_extra[i].type_src == c_cean_var_ptr_ptr) {
1789 // array descriptor
1790 const Arr_Desc *ap =
1791 static_cast<const Arr_Desc*>(m_vars[i].ptr);
1793 // debug dump
1794 ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
1796 // offset and length are derived from the array descriptor
1797 __arr_data_offset_and_length(ap, m_vars[i].disp,
1798 m_vars[i].size);
1800 if (!is_arr_desc_contiguous(ap)) {
1801 m_vars[i].flags.is_noncont_src = 1;
1802 m_vars_extra[i].read_rng_src =
1803 init_read_ranges_arr_desc(ap);
1805 // all necessary information about length and offset is
1806 // transferred in var descriptor. There is no need to send
1807 // array descriptor to the target side.
1808 m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1810 else if (m_vars_extra[i].type_src == c_dv_ptr) {
1811 // need to send DV to the device unless it is 'nocopy'
1812 if (m_vars[i].direction.bits ||
1813 m_vars[i].alloc_if ||
1814 m_vars[i].free_if) {
1815 ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1817 // debug dump
1818 __dv_desc_dump("IN/OUT", dvp);
1820 // for use_device_ptr don't need to change
1821 // OUT direction to IN direction
1822 if (!m_vars[i].flags.use_device_ptr) {
1823 m_vars[i].direction.bits = c_parameter_in;
1827 // no displacement
1828 m_vars[i].disp = 0;
1830 else {
1831 // For "use_device_ptr" if direction is "in" then need to
1832 // find the associated device pointer and replace the host
1833 // pointer with device pointer. Also save the host pointer
1834 // to restore when "out" is encountered.
1835 // For "out" find the host pointer associated with the
1836 // device pointer and restore the host pointer
1837 if (m_vars[i].flags.use_device_ptr && src_is_for_host) {
1838 setup_use_device_ptr(i);
1839 break;
1842 // c_data_ptr or c_string_ptr
1843 m_vars[i].size *= m_vars[i].count;
1844 m_vars[i].disp = 0;
1847 if (m_vars[i].direction.bits ||
1848 m_vars[i].alloc_if ||
1849 m_vars[i].free_if) {
1850 PtrData *ptr_data;
1852 // check that buffer length > 0
1853 if (m_vars[i].alloc_if &&
1854 m_vars[i].disp + m_vars[i].size <
1855 (m_is_openmp ? 0 : 1)) {
1856 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1857 exit(1);
1860 // base address
1861 void *base = *static_cast<void**>(m_vars[i].ptr);
1863 // allocate buffer if we have no INTO and don't need
1864 // allocation for the ptr at target
1865 if (src_is_for_mic) {
1866 if (m_vars[i].flags.is_stack_buf) {
1867 // for stack persistent objects ptr data is created
1868 // by var_desc with number 0.
1869 // Its ptr_data is stored at m_stack_ptr_data
1870 ptr_data = m_stack_ptr_data;
1872 else if (m_vars[i].alloc_if) {
1873 if (m_vars[i].flags.preallocated) {
1874 m_out_datalen += sizeof(void*);
1875 m_need_runfunction = true;
1876 break;
1878 // add new entry
1879 if (!alloc_ptr_data(
1880 ptr_data,
1881 reinterpret_cast<char *>(base) + alloc_disp,
1882 (alloc_base != NULL) ?
1883 alloc_disp : m_vars[i].disp,
1884 (alloc_base != NULL) ?
1885 alloc_size : m_vars[i].size,
1886 alloc_disp,
1887 (alloc_base != NULL) ?
1888 0 : m_vars[i].align,
1889 m_vars[i].flags.targetptr,
1891 m_vars[i].flags.pin)) {
1892 return false;
1894 if (m_vars[i].flags.targetptr) {
1895 if (!init_mic_address(ptr_data)) {
1896 return false;
1898 *static_cast<void**>(m_vars[i].ptr) = base =
1899 reinterpret_cast<void*>(ptr_data->mic_addr);
1901 if (ptr_data->add_reference() == 0 &&
1902 ptr_data->mic_buf != 0) {
1903 // add buffer to the list of buffers that
1904 // are passed to dispatch call
1905 m_compute_buffers.push_back(
1906 ptr_data->mic_buf);
1908 else if (!m_vars[i].flags.pin &&
1909 !m_vars[i].flags.preallocated) {
1910 // will send buffer address to device
1911 m_vars[i].flags.sink_addr = 1;
1912 m_in_datalen += sizeof(ptr_data->mic_addr);
1915 if (!m_vars[i].flags.pin &&
1916 !ptr_data->is_static) {
1917 // need to add reference for buffer
1918 m_need_runfunction = true;
1921 else {
1922 bool error_if_not_found = true;
1923 if (m_is_openmp) {
1924 // For omp target update variable is ignored
1925 // if it does not exist.
1926 if (m_vars[i].flags.always_copy ||
1927 (!m_vars[i].alloc_if &&
1928 !m_vars[i].free_if)) {
1929 error_if_not_found = false;
1933 // use existing association from pointer table
1934 if (!find_ptr_data(ptr_data,
1935 base,
1936 m_vars[i].disp,
1937 m_vars[i].size,
1938 m_vars[i].flags.targetptr,
1939 error_if_not_found)) {
1940 return false;
1943 if (m_is_openmp) {
1944 // make var nocopy if it does not exist
1945 if (ptr_data == 0) {
1946 m_vars[i].direction.bits =
1947 c_parameter_nocopy;
1951 if (ptr_data != 0) {
1952 m_vars[i].flags.sink_addr = 1;
1953 m_in_datalen += sizeof(ptr_data->mic_addr);
1957 if (ptr_data != 0) {
1959 if (ptr_data->alloc_disp != 0) {
1960 m_vars[i].flags.alloc_disp = 1;
1961 m_in_datalen += sizeof(alloc_disp);
1964 if (m_vars[i].flags.sink_addr) {
1965                         // get buffer's address on the sink
1966 if (!init_mic_address(ptr_data)) {
1967 return false;
1970 m_in_datalen += sizeof(ptr_data->mic_addr);
1973 if (!m_vars[i].flags.pin &&
1974 !ptr_data->is_static && m_vars[i].free_if) {
1975 // need to decrement buffer reference on target
1976 m_need_runfunction = true;
1979 // offset to base from the beginning of the buffer
1980 // memory
1981 m_vars[i].offset = (char*) base -
1982 (char*) ptr_data->cpu_addr.start();
1984 // copy other pointer properties to var descriptor
1985 m_vars[i].mic_offset = ptr_data->mic_offset;
1986 m_vars[i].flags.is_static = ptr_data->is_static;
1989 else {
1990 if (!find_ptr_data(ptr_data,
1991 base,
1992 m_vars[i].disp,
1993 m_vars[i].size,
1994 false, false)) {
1995 return false;
1997 if (ptr_data) {
1998 m_vars[i].offset =
1999 (char*) base -
2000 (char*) ptr_data->cpu_addr.start();
2004 if (m_is_openmp) {
2005 if (m_vars[i].flags.use_device_ptr) {
2006 setup_use_device_ptr(i);
2008 // for TO transfer of stack buffer's variable
2009 if (src_is_for_host && m_vars[i].flags.is_stack_buf) {
2010 AutoData *auto_data;
2011 char *base = *static_cast<char**>(m_vars[i].ptr);
2012 if (m_vars[i].alloc_if) {
2013                         auto_data = m_device.insert_auto_data(
2014 base + m_vars[i].disp,
2015 m_vars[i].size);
2016 auto_data->add_reference();
2018 else {
2019 auto_data = m_device.find_auto_data(
2020 base + m_vars[i].disp);
2022 // save data for later use
2023 m_vars_extra[i].auto_data = auto_data;
2025 // For automatic variables
2026 // data is transferred:
2027 // - if always modifier is used OR
2028 // - if alloc_if == 0 && free_if == 0 OR
2029 // - if reference count is 1
2030 if (!m_vars[i].flags.always_copy &&
2031 (m_vars[i].alloc_if ||
2032 m_vars[i].free_if) &&
2033 auto_data != 0 &&
2034 auto_data->get_reference() != 1) {
2035 m_vars[i].direction.bits =
2036 c_parameter_nocopy;
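                        // Worked example of the rule above: a second alloc_if
                        // mapping of the same stack variable raises its
                        // reference count to 2, so the copy is suppressed
                        // (nocopy) unless the always modifier forces the
                        // transfer.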
2039                 // for FROM transfer of a global pointer variable;
2040                 // FROM transfer of a stack buffer's variable
2041                 // is treated at the INTO branch
2042 else if (src_is_for_mic &&
2043 !m_vars[i].flags.is_stack_buf) {
2044 // data is transferred only if
2045 // alloc_if == 0 && free_if == 0
2046 // or reference count is 1
2047 if (!m_vars[i].flags.always_copy &&
2048 (m_vars[i].alloc_if ||
2049 m_vars[i].free_if) &&
2050 ptr_data &&
2051 ptr_data->get_reference() != 1)
2053 m_vars[i].direction.bits =
2054 c_parameter_nocopy;
2058 // save pointer data
2059 m_vars_extra[i].src_data = ptr_data;
2061 break;
2063 case c_func_ptr:
2064 case c_func_ptr_ptr:
2065 if (m_vars[i].direction.in) {
2066 m_in_datalen += __offload_funcs.max_name_length();
2068 if (m_vars[i].direction.out) {
2069 m_out_datalen += __offload_funcs.max_name_length();
2071 m_need_runfunction = true;
2072 break;
2074 case c_dv_data:
2075 case c_dv_ptr_data:
2076 case c_dv_data_slice:
2077 case c_dv_ptr_data_slice:
2078 ArrDesc *dvp;
2079 if (m_vars[i].flags.is_non_cont_struct) {
2080 NonContigDesc *desc =
2081 static_cast<NonContigDesc*>(m_vars[i].ptr);
2082 noncont_struct_dump(" ", "DV-DATA", desc);
2083 dvp = reinterpret_cast<ArrDesc*>(desc->base);
2085 else if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2086 const Arr_Desc *ap;
2087 ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
2089 dvp = (m_vars_extra[i].type_src == c_dv_data_slice) ?
2090 reinterpret_cast<ArrDesc*>(ap->base) :
2091 *reinterpret_cast<ArrDesc**>(ap->base);
2093 else {
2094 dvp = (m_vars_extra[i].type_src == c_dv_data) ?
2095 static_cast<ArrDesc*>(m_vars[i].ptr) :
2096 *static_cast<ArrDesc**>(m_vars[i].ptr);
2099             // if an allocatable dope vector isn't allocated, don't
2100             // transfer its data
2101 if (!__dv_is_allocated(dvp)) {
2102 m_vars[i].direction.bits = c_parameter_nocopy;
2103 m_vars[i].alloc_if = 0;
2104 m_vars[i].free_if = 0;
2106 if (m_vars[i].direction.bits ||
2107 m_vars[i].alloc_if ||
2108 m_vars[i].free_if) {
2109 const Arr_Desc *ap;
2111 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2112 ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
2114 // debug dump
2115 ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
2117 if (!__dv_is_contiguous(dvp)) {
2118 m_vars[i].flags.is_noncont_src = 1;
2119 m_vars_extra[i].read_rng_src =
2120 init_read_ranges_dv(dvp);
2123 // size and displacement
2124 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2125 // offset and length are derived from the
2126 // array descriptor
2127 __arr_data_offset_and_length(ap,
2128 m_vars[i].disp,
2129 m_vars[i].size);
2130 if (m_vars[i].direction.bits) {
2131 if (!is_arr_desc_contiguous(ap)) {
2132 if (m_vars[i].flags.is_noncont_src) {
2133 LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
2134 return false;
2136 m_vars[i].flags.is_noncont_src = 1;
2137 m_vars_extra[i].read_rng_src =
2138 init_read_ranges_arr_desc(ap);
2142 else {
2143 if (m_vars[i].flags.has_length) {
2144 m_vars[i].size =
2145 __dv_data_length(dvp, m_vars[i].count);
2147 else {
2148 m_vars[i].size = __dv_data_length(dvp);
2150 m_vars[i].disp = 0;
2153 // check that length >= 0
2154 if (m_vars[i].alloc_if &&
2155 (m_vars[i].disp + m_vars[i].size < 0)) {
2156 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
2157 exit(1);
2160 // base address
2161 void *base = reinterpret_cast<void*>(dvp->Base);
2162 PtrData *ptr_data;
2164 // allocate buffer if we have no INTO and don't need
2165 // allocation for the ptr at target
2166 if (src_is_for_mic) {
2167 if (m_vars[i].alloc_if) {
2168 // add new entry
2169 if (!alloc_ptr_data(
2170 ptr_data,
2171 reinterpret_cast<char *>(base) + alloc_disp,
2172 (alloc_base != NULL) ?
2173 alloc_disp : m_vars[i].disp,
2174 (alloc_base != NULL) ?
2175 alloc_size : m_vars[i].size,
2176 alloc_disp,
2177 (alloc_base != NULL) ?
2178 0 : m_vars[i].align,
2179 m_vars[i].flags.targetptr,
2180 m_vars[i].flags.preallocated,
2181 m_vars[i].flags.pin)) {
2182 return false;
2185 if (ptr_data->add_reference() == 0 &&
2186 ptr_data->mic_buf != 0) {
2187 // add buffer to the list of buffers
2188 // that are passed to dispatch call
2189 m_compute_buffers.push_back(
2190 ptr_data->mic_buf);
2192 else {
2193 // will send buffer address to device
2194 m_vars[i].flags.sink_addr = 1;
2197 if (!ptr_data->is_static) {
2198 // need to add reference for buffer
2199 m_need_runfunction = true;
2202 else {
2203 bool error_if_not_found = true;
2204 if (m_is_openmp) {
2205                         // For omp target update a variable is ignored
2206                         // if it does not exist.
2207 if (m_vars[i].flags.always_copy ||
2208 (!m_vars[i].alloc_if &&
2209 !m_vars[i].free_if)) {
2210 error_if_not_found = false;
2214 // use existing association from pointer table
2215 if (!find_ptr_data(ptr_data,
2216 base,
2217 m_vars[i].disp,
2218 m_vars[i].size,
2219 m_vars[i].flags.targetptr,
2220 error_if_not_found)) {
2221 return false;
2224 if (m_is_openmp) {
2225 // make var nocopy if it does not exist
2226 if (ptr_data == 0) {
2227 m_vars[i].direction.bits =
2228 c_parameter_nocopy;
2232 if (ptr_data != 0) {
2233 // need to update base in dope vector on device
2234 m_vars[i].flags.sink_addr = 1;
2238 if (ptr_data != 0) {
2239 if (m_is_openmp) {
2240                         // data is transferred:
2241 // - if always modifier is used OR
2242 // - if alloc_if == 0 && free_if == 0 OR
2243 // - if reference count is 1
2244 if (!m_vars[i].flags.always_copy &&
2245 (m_vars[i].alloc_if ||
2246 m_vars[i].free_if) &&
2247 ptr_data->get_reference() != 1) {
2248 m_vars[i].direction.bits =
2249 c_parameter_nocopy;
2253 if (ptr_data->alloc_disp != 0) {
2254 m_vars[i].flags.alloc_disp = 1;
2255 m_in_datalen += sizeof(alloc_disp);
2258 if (m_vars[i].flags.sink_addr) {
2259                     // get buffer's address on the sink
2260 if (!init_mic_address(ptr_data)) {
2261 return false;
2264 m_in_datalen += sizeof(ptr_data->mic_addr);
2267 if (!ptr_data->is_static && m_vars[i].free_if) {
2268 // need to decrement buffer reference on target
2269 m_need_runfunction = true;
2272 // offset to base from the beginning of the buffer
2273 // memory
2274 m_vars[i].offset =
2275 (char*) base -
2276 (char*) ptr_data->cpu_addr.start();
2278 // copy other pointer properties to var descriptor
2279 m_vars[i].mic_offset = ptr_data->mic_offset;
2280 m_vars[i].flags.is_static = ptr_data->is_static;
2283 else { // !src_is_for_mic
2284 if (!find_ptr_data(ptr_data,
2285 base,
2286 m_vars[i].disp,
2287 m_vars[i].size,
2288 false, false)) {
2289 return false;
2291 m_vars[i].offset = !ptr_data ? 0 :
2292 (char*) base -
2293 (char*) ptr_data->cpu_addr.start();
2296 // save pointer data
2297 m_vars_extra[i].src_data = ptr_data;
2299 break;
2301 default:
2302 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
2303 LIBOFFLOAD_ABORT;
2305 if (m_vars_extra[i].type_src == c_data_ptr_array) {
2306 continue;
2309 if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
2310 if (this_threads_cpu_stack_addr == 0) {
2311 this_threads_cpu_stack_addr =
2312 get_this_threads_cpu_stack_addr(
2313 stack_addr, entry_id, thread_specific_function_locals);
2315 m_vars[i].offset = static_cast<char*>
2316 (m_vars[i].ptr) -
2317 this_threads_cpu_stack_addr;
2319 // if source is used at CPU save its offset and disp
2320 if (m_vars[i].into == NULL || m_vars[i].direction.in) {
2321 m_vars_extra[i].cpu_offset = m_vars[i].offset;
2322 m_vars_extra[i].cpu_disp = m_vars[i].disp;
2325         // If "into" is defined we need to do similar work for it
2326 if (!m_vars[i].into) {
2327 continue;
2330         int64_t into_disp = 0, into_offset = 0;
2332 switch (m_vars_extra[i].type_dst) {
2333 case c_data_ptr_array:
2334 break;
2335 case c_data:
2336 case c_void_ptr:
2337 case c_void_ptr_ptr:
2338 case c_cean_var: {
2339 int64_t size = m_vars[i].size;
2341 if (m_vars[i].flags.is_non_cont_struct && src_is_for_mic) {
2342 NonContigDesc *desc =
2343 static_cast<NonContigDesc*>(m_vars[i].into);
2344 noncont_struct_dump("", "INTO DATA", desc);
2345 m_vars_extra[i].noncont_desc = desc;
2346 m_vars[i].into = reinterpret_cast<void*>(desc->base);
2347 size = get_noncont_struct_size(desc);
2348 into_disp = 0;
2350 else if (m_vars_extra[i].type_dst == c_cean_var) {
2351 // array descriptor
2352 const Arr_Desc *ap =
2353 static_cast<const Arr_Desc*>(m_vars[i].into);
2355 // debug dump
2356 ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
2358 // offset and length are derived from the array descriptor
2359 __arr_data_offset_and_length(ap, into_disp, size);
2361 if (!is_arr_desc_contiguous(ap)) {
2362 m_vars[i].flags.is_noncont_dst = 1;
2363 m_vars_extra[i].read_rng_dst =
2364 init_read_ranges_arr_desc(ap);
2365 if (!cean_ranges_match(
2366 m_vars_extra[i].read_rng_src,
2367 m_vars_extra[i].read_rng_dst)) {
2368 LIBOFFLOAD_ERROR(c_ranges_dont_match);
2369 exit(1);
2372 m_vars[i].into = reinterpret_cast<void*>(ap->base);
2375 int64_t size_src = m_vars_extra[i].read_rng_src &&
2376 !m_vars[i].flags.is_non_cont_struct ?
2377 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2378 m_vars[i].size;
2379 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
2380 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2381 size;
2382                 // The "into" size is supposed to be no less
2383                 // than the src size
2384 if (size_src > size_dst) {
2385 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2386 size_src, size_dst);
2387 exit(1);
2390 if (m_vars[i].direction.bits) {
2391 if (m_vars[i].flags.is_static_dstn) {
2392 PtrData *ptr_data;
2394 // find data associated with variable
2395 if (!find_ptr_data(ptr_data, m_vars[i].into,
2396 into_disp, size, false, false)) {
2397 return false;
2399 if (ptr_data != 0) {
2400 // offset to base from the beginning of the buffer
2401 // memory
2402 into_offset =
2403 (char*) m_vars[i].into -
2404 (char*) ptr_data->cpu_addr.start();
2406 else {
2407 m_vars[i].flags.is_static_dstn = false;
2409 m_vars_extra[i].dst_data = ptr_data;
2413 if (m_vars[i].direction.in &&
2414 !m_vars[i].flags.is_static_dstn) {
2415 m_in_datalen += m_vars[i].size;
2417                     // for a non-static target destination defined as a CEAN
2418                     // expression we pass its size and disp to the target
2419 if (m_vars_extra[i].type_dst == c_cean_var) {
2420 m_in_datalen += 2 * sizeof(uint64_t);
2422 m_need_runfunction = true;
2425 if (m_is_openmp && src_is_for_mic) {
2426 if (m_vars[i].flags.is_static_dstn) {
2427 // Static data is transferred either by omp target
2428 // update construct which passes zeros for
2429 // alloc_if and free_if or by always modifier.
2430 if (!m_vars[i].flags.always_copy &&
2431 (m_vars[i].alloc_if || m_vars[i].free_if)) {
2432 m_vars[i].direction.bits = c_parameter_nocopy;
2435 else {
2436 AutoData *auto_data;
2437 if (m_vars[i].alloc_if) {
2438 auto_data = m_device.insert_auto_data(
2439 m_vars[i].into, size_dst);
2440 auto_data->add_reference();
2442 else {
2443 // TODO: what should be done if var is not in
2444 // the table?
2445 auto_data = m_device.find_auto_data(
2446 m_vars[i].into);
2449 // For automatic variables data is transferred:
2450 // - if always modifier is used OR
2451 // - if alloc_if == 0 && free_if == 0 OR
2452 // - if reference count is 1
2453 if (!m_vars[i].flags.always_copy &&
2454 (m_vars[i].alloc_if || m_vars[i].free_if) &&
2455 (auto_data == 0 ||
2456 auto_data->get_reference() != 1)) {
2457 m_vars[i].direction.bits = c_parameter_nocopy;
2459 // save data for later use
2460 m_vars_extra[i].auto_data = auto_data;
2463 break;
2466 case c_dv:
2467 if (m_vars[i].direction.bits ||
2468 m_vars[i].alloc_if ||
2469 m_vars[i].free_if) {
2470 ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
2472 // debug dump
2473 __dv_desc_dump("INTO", dvp);
2475 // send dope vector contents excluding base
2476 m_in_datalen += m_vars[i].size - sizeof(uint64_t);
2477 m_need_runfunction = true;
2479 break;
2481 case c_string_ptr:
2482 case c_data_ptr:
2483 case c_string_ptr_ptr:
2484 case c_data_ptr_ptr:
2485 case c_cean_var_ptr:
2486 case c_cean_var_ptr_ptr:
2487 case c_dv_ptr: {
2488 int64_t size = m_vars[i].size;
2490 if (m_vars_extra[i].type_dst == c_cean_var_ptr ||
2491 m_vars_extra[i].type_dst == c_cean_var_ptr_ptr) {
2492 // array descriptor
2493 const Arr_Desc *ap =
2494 static_cast<const Arr_Desc*>(m_vars[i].into);
2496 // debug dump
2497 ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic);
2499 // offset and length are derived from the array descriptor
2500 __arr_data_offset_and_length(ap, into_disp, size);
2502 if (!is_arr_desc_contiguous(ap)) {
2503 m_vars[i].flags.is_noncont_src = 1;
2504 m_vars_extra[i].read_rng_dst =
2505 init_read_ranges_arr_desc(ap);
2506 if (!cean_ranges_match(
2507 m_vars_extra[i].read_rng_src,
2508 m_vars_extra[i].read_rng_dst)) {
2509 LIBOFFLOAD_ERROR(c_ranges_dont_match);
2512 m_vars[i].into = reinterpret_cast<char**>(ap->base);
2514 else if (m_vars_extra[i].type_dst == c_dv_ptr) {
2515 // need to send DV to the device unless it is 'nocopy'
2516 if (m_vars[i].direction.bits ||
2517 m_vars[i].alloc_if ||
2518 m_vars[i].free_if) {
2519 ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
2521 // debug dump
2522 __dv_desc_dump("INTO", dvp);
2524 m_vars[i].direction.bits = c_parameter_in;
2528 int64_t size_src = m_vars_extra[i].read_rng_src &&
2529 !m_vars[i].flags.is_non_cont_struct ?
2530 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2531 m_vars[i].size;
2532 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
2533 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2534 size;
2535             // The "into" size is supposed to be no less than
2536             // the src size
2537 if (size_src > size_dst) {
2538 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2539 size_src, size_dst);
2540 exit(1);
2543 if (m_vars[i].direction.bits) {
2544 PtrData *ptr_data;
2546 // base address
2547 void *base = *static_cast<void**>(m_vars[i].into);
2549 if (m_vars[i].direction.in) {
2550 // allocate buffer
2551 if (m_vars[i].flags.is_stack_buf) {
2552 // for stack persistent objects ptr data is created
2553 // by var_desc with number 0.
2554 // Its ptr_data is stored at m_stack_ptr_data
2555 ptr_data = m_stack_ptr_data;
2557 else if (m_vars[i].alloc_if) {
2558 if (m_vars[i].flags.preallocated) {
2559 m_out_datalen += sizeof(void*);
2560 m_need_runfunction = true;
2561 break;
2563 // add new entry
2564 if (!alloc_ptr_data(
2565 ptr_data,
2566 reinterpret_cast<char *>(base) + alloc_disp,
2567 (alloc_base != NULL) ?
2568 alloc_disp : into_disp,
2569 (alloc_base != NULL) ?
2570 alloc_size : size,
2571 alloc_disp,
2572 (alloc_base != NULL) ?
2573 0 : m_vars[i].align,
2574 m_vars[i].flags.targetptr,
2575 m_vars[i].flags.preallocated,
2576 m_vars[i].flags.pin)) {
2577 return false;
2579 if (m_vars[i].flags.targetptr) {
2580 if (!init_mic_address(ptr_data)) {
2581 return false;
2583 *static_cast<void**>(m_vars[i].into) = base =
2584 reinterpret_cast<void*>(ptr_data->mic_addr);
2586 if (ptr_data->add_reference() == 0 &&
2587 ptr_data->mic_buf != 0) {
2588 // add buffer to the list of buffers that
2589 // are passed to dispatch call
2590 m_compute_buffers.push_back(
2591 ptr_data->mic_buf);
2593 else {
2594 // will send buffer address to device
2595 m_vars[i].flags.sink_addr = 1;
2598 if (!ptr_data->is_static) {
2599 // need to add reference for buffer
2600 m_need_runfunction = true;
2603 else {
2604 // use existing association from pointer table
2605 if (!find_ptr_data(ptr_data, base, into_disp,
2606 size, m_vars[i].flags.targetptr, true)) {
2607 return false;
2609 m_vars[i].flags.sink_addr = 1;
2612 if (ptr_data->alloc_disp != 0) {
2613 m_vars[i].flags.alloc_disp = 1;
2614 m_in_datalen += sizeof(alloc_disp);
2617 if (m_vars[i].flags.sink_addr) {
2618                     // get buffer's address on the sink
2619 if (!init_mic_address(ptr_data)) {
2620 return false;
2623 m_in_datalen += sizeof(ptr_data->mic_addr);
2626 if (!ptr_data->is_static && m_vars[i].free_if) {
2627 // need to decrement buffer reference on target
2628 m_need_runfunction = true;
2631 // copy other pointer properties to var descriptor
2632 m_vars[i].mic_offset = ptr_data->mic_offset;
2633 m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2635 else {
2636 if (!find_ptr_data(ptr_data,
2637 base,
2638 into_disp,
2639 m_vars[i].size,
2640 false, false)) {
2641 return false;
2644 if (ptr_data) {
2645 into_offset = ptr_data ?
2646 (char*) base -
2647 (char*) ptr_data->cpu_addr.start() :
2651 if (m_is_openmp) {
2652 // for FROM transfer of stack buffer's variable
2653 if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
2654 AutoData *auto_data;
2655 char *base = *static_cast<char**>(m_vars[i].into);
2656 if (m_vars[i].alloc_if) {
2657                         auto_data = m_device.insert_auto_data(
2658 base + into_disp,
2659 size);
2660 auto_data->add_reference();
2662 else {
2663 auto_data = m_device.find_auto_data(
2664 base + into_disp);
2666 // save data for later use
2667 m_vars_extra[i].auto_data = auto_data;
2668 // For automatic variables
2669 // data is transferred:
2670 // - if always modifier is used OR
2671 // - if alloc_if == 0 && free_if == 0 OR
2672 // - if reference count is 1
2673 if (!m_vars[i].flags.always_copy &&
2674 (m_vars[i].alloc_if ||
2675 m_vars[i].free_if) &&
2676 auto_data != 0 &&
2677 auto_data->get_reference() != 1) {
2678 m_vars[i].direction.bits =
2679 c_parameter_nocopy;
2683 // save pointer data
2684 m_vars_extra[i].dst_data = ptr_data;
2686 break;
2689 case c_func_ptr:
2690 case c_func_ptr_ptr:
2691 break;
2693 case c_dv_data:
2694 case c_dv_ptr_data:
2695 case c_dv_data_slice:
2696 case c_dv_ptr_data_slice:
2697 if (m_vars[i].direction.bits ||
2698 m_vars[i].alloc_if ||
2699 m_vars[i].free_if) {
2700 const Arr_Desc *ap;
2701 ArrDesc *dvp;
2702 PtrData *ptr_data;
2703 int64_t disp;
2704 int64_t size;
2706 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
2707 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
2709 // debug dump
2710 ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
2712 dvp = (m_vars_extra[i].type_dst == c_dv_data_slice) ?
2713 reinterpret_cast<ArrDesc*>(ap->base) :
2714 *reinterpret_cast<ArrDesc**>(ap->base);
2716 else {
2717 dvp = (m_vars_extra[i].type_dst == c_dv_data) ?
2718 static_cast<ArrDesc*>(m_vars[i].into) :
2719 *static_cast<ArrDesc**>(m_vars[i].into);
2721 if (!__dv_is_contiguous(dvp)) {
2722 m_vars[i].flags.is_noncont_dst = 1;
2723 m_vars_extra[i].read_rng_dst =
2724 init_read_ranges_dv(dvp);
2726 // size and displacement
2727 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
2728 // offset and length are derived from the array
2729 // descriptor
2730 __arr_data_offset_and_length(ap, into_disp, size);
2731 if (m_vars[i].direction.bits) {
2732 if (!is_arr_desc_contiguous(ap)) {
2733 if (m_vars[i].flags.is_noncont_dst) {
2734 LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
2735 return false;
2737 m_vars[i].flags.is_noncont_dst = 1;
2738 m_vars_extra[i].read_rng_dst =
2739 init_read_ranges_arr_desc(ap);
2740 if (!cean_ranges_match(
2741 m_vars_extra[i].read_rng_src,
2742 m_vars_extra[i].read_rng_dst)) {
2743 LIBOFFLOAD_ERROR(c_ranges_dont_match);
2748 else {
2749 if (m_vars[i].flags.has_length) {
2750 size = __dv_data_length(dvp, m_vars[i].count);
2752 else {
2753 size = __dv_data_length(dvp);
2755 disp = 0;
2758 int64_t size_src =
2759 m_vars_extra[i].read_rng_src &&
2760 (!m_vars[i].flags.is_non_cont_struct ||
2761 src_is_for_mic) ?
2762 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2763 m_vars[i].size;
2764 int64_t size_dst =
2765 m_vars_extra[i].read_rng_dst ?
2766 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2767 size;
2768                 // The "into" size is supposed to be no less
2769                 // than the src size
2770 if (size_src > size_dst) {
2771 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2772 size_src, size_dst);
2773 exit(1);
2776 // base address
2777 void *base = reinterpret_cast<void*>(dvp->Base);
2779 // allocate buffer
2780 if (m_vars[i].direction.in) {
2781 if (m_vars[i].alloc_if) {
2782 // add new entry
2783 if (!alloc_ptr_data(
2784 ptr_data,
2785 reinterpret_cast<char *>(base) + alloc_disp,
2786 (alloc_base != NULL) ?
2787 alloc_disp : into_disp,
2788 (alloc_base != NULL) ?
2789 alloc_size : size,
2790 alloc_disp,
2791 (alloc_base != NULL) ?
2792 0 : m_vars[i].align,
2793 m_vars[i].flags.targetptr,
2794 m_vars[i].flags.preallocated,
2795 m_vars[i].flags.pin)) {
2796 return false;
2798 if (ptr_data->add_reference() == 0 &&
2799                         ptr_data->mic_buf != 0) {
2800 // add buffer to the list of buffers
2801 // that are passed to dispatch call
2802 m_compute_buffers.push_back(
2803 ptr_data->mic_buf);
2805 else {
2806 // will send buffer address to device
2807 m_vars[i].flags.sink_addr = 1;
2810 if (!ptr_data->is_static) {
2811 // need to add reference for buffer
2812 m_need_runfunction = true;
2815 else {
2816 // use existing association from pointer table
2817 if (!find_ptr_data(ptr_data, base, into_disp,
2818 size, m_vars[i].flags.targetptr, true)) {
2819 return false;
2822 // need to update base in dope vector on device
2823 m_vars[i].flags.sink_addr = 1;
2826 if (ptr_data->alloc_disp != 0) {
2827 m_vars[i].flags.alloc_disp = 1;
2828 m_in_datalen += sizeof(alloc_disp);
2831 if (m_vars[i].flags.sink_addr) {
2832                     // get buffer's address on the sink
2833 if (!init_mic_address(ptr_data)) {
2834 return false;
2836 m_in_datalen += sizeof(ptr_data->mic_addr);
2839 if (!ptr_data->is_static && m_vars[i].free_if) {
2840 // need to decrement buffer reference on target
2841 m_need_runfunction = true;
2844 // offset to base from the beginning of the buffer
2845 // memory
2846 into_offset =
2847 (char*) base - (char*) ptr_data->cpu_addr.start();
2849 // copy other pointer properties to var descriptor
2850 m_vars[i].mic_offset = ptr_data->mic_offset;
2851 m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2853 else { // src_is_for_mic
2854 if (!find_ptr_data(ptr_data,
2855 base,
2856 into_disp,
2857 size,
2858 false, false)) {
2859 return false;
2861 into_offset = !ptr_data ?
2863 (char*) base - (char*) ptr_data->cpu_addr.start();
2866 // save pointer data
2867 m_vars_extra[i].dst_data = ptr_data;
2869 break;
2871 default:
2872 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
2873 LIBOFFLOAD_ABORT;
2875 // if into is used at CPU save its offset and disp
2876 if (m_vars[i].direction.out) {
2877 m_vars_extra[i].cpu_offset = into_offset;
2878 m_vars_extra[i].cpu_disp = into_disp;
2880 else {
2881 if (m_vars[i].flags.is_stack_buf) {
2882 if (this_threads_cpu_stack_addr == 0) {
2883 this_threads_cpu_stack_addr =
2884 get_this_threads_cpu_stack_addr(
2885 stack_addr, entry_id,
2886 thread_specific_function_locals);
2888 into_offset = static_cast<char*>
2889 (m_vars[i].into) -
2890 this_threads_cpu_stack_addr;
2892 m_vars[i].offset = into_offset;
2893 m_vars[i].disp = into_disp;
2897 return true;
2900 bool OffloadDescriptor::setup_misc_data(const char *name)
2902 OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
2904     // we can skip the run function call together with the wait if the
2905     // offloaded region is empty and there is no user-defined non-pointer
2906     // IN/OUT data
2906 if (m_need_runfunction) {
2907 // variable descriptors are sent as input data
2908 m_in_datalen += m_vars_total * sizeof(VarDesc);
2910 // timer data is sent as a part of the output data
2911 m_out_datalen += OFFLOAD_TIMER_DATALEN();
2913     // max of input data and output data length
2914 uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
2915 m_out_datalen;
2917 // Misc data has the following layout
2918 // <Function Descriptor>
2919 // <Function Name>
2920 // <In/Out Data> (optional)
2922     // We can transfer copyin/copyout data in the misc/return data which can
2923     // be passed to the run function call if its size does not exceed
2924     // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate a
2925     // buffer for it.
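    // In other words (a sketch of the decision below): small transfers ride
    // along in the misc/return data of the run function call, while anything
    // larger goes through a dedicated COI buffer created via BufferCreate.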
2927 m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
2928 m_func_desc_size = (m_func_desc_size + 7) & ~7;
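    // the statement above rounds the descriptor size up to a multiple of 8
    // so the optional in/out data that follows stays 8-byte aligned;
    // e.g. (41 + 7) & ~7 == 48, and 48 stays 48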
2930 int misc_data_offset = 0;
2931 int misc_data_size = 0;
2932 if (data_len > 0) {
2933 if (m_func_desc_size +
2934 m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
2935 m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
2936 // use misc/return data for copyin/copyout
2937 misc_data_offset = m_func_desc_size;
2938 misc_data_size = data_len;
2940 else {
2941 OffloadTimer timer_buf(get_timer_data(),
2942 c_offload_host_alloc_data_buffer);
2944 // send/receive data using buffer
2945 COIRESULT res = COI::BufferCreate(data_len,
2946 COI_BUFFER_OPENCL,
2947 0, 0,
2948 1, &m_device.get_process(),
2949 &m_inout_buf);
2950 if (res != COI_SUCCESS) {
2951 if (m_status != 0) {
2952 m_status->result = translate_coi_error(res);
2953 return false;
2955 report_coi_error(c_buf_create, res);
2958 m_compute_buffers.push_back(m_inout_buf);
2959 m_destroy_buffers.push_back(m_inout_buf);
2963 // initialize function descriptor
2964 m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
2965 misc_data_size);
2966 if (m_func_desc == NULL)
2967 LIBOFFLOAD_ERROR(c_malloc);
2968 m_func_desc->console_enabled = console_enabled;
2969 m_func_desc->timer_enabled = offload_report_enabled &&
2970 (timer_enabled || offload_report_level);
2971 m_func_desc->offload_report_level = offload_report_enabled ?
2972 offload_report_level : 0;
2973 m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
2974 m_func_desc->in_datalen = m_in_datalen;
2975 m_func_desc->out_datalen = m_out_datalen;
2976 m_func_desc->vars_num = m_vars_total;
2977 m_func_desc->data_offset = misc_data_offset;
2979 // append entry name
2980 strcpy(m_func_desc->data, name);
2983 return true;
2986 void OffloadDescriptor::setup_omp_async_info()
2988 OFFLOAD_TRACE(2, "setup_omp_async_info\n");
2989 OmpAsyncLastEventType event_type = m_need_runfunction ?
2990 c_last_runfunc : c_last_write;
2991 int last_in = m_need_runfunction ? 0 : -1;
2992 int i;
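    // A sketch of the scan below: walk the variables from last to first;
    // the first OUT transfer found (i.e. the last one issued) makes the
    // last event a read (c_last_read) and stops the scan. Otherwise the
    // last IN transfer remembered in last_in gets c_last_write; with no
    // matching transfers the initial choice above stands.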
2994 for (i = m_vars_total - 1; i >=0; i--) {
2995 bool src_is_target = (m_vars[i].direction.out || !m_vars[i].into);
2996 int var_type = src_is_target ? m_vars_extra[i].type_src :
2997 m_vars_extra[i].type_dst;
2998 bool target_is_static = src_is_target ? m_vars[i].flags.is_static :
2999 m_vars[i].flags.is_static_dstn;
3000 switch (var_type) {
3001 case c_data:
3002 case c_void_ptr:
3003 case c_cean_var:
3004 if (m_vars[i].direction.out && target_is_static) {
3005 event_type = c_last_read;
3007 else if (last_in < 0 && m_vars[i].direction.in &&
3008 target_is_static) {
3009 last_in = i;
3011 break;
3012 case c_string_ptr:
3013 case c_data_ptr:
3014 case c_string_ptr_ptr:
3015 case c_data_ptr_ptr:
3016 case c_cean_var_ptr:
3017 case c_cean_var_ptr_ptr:
3018 case c_dv_ptr:
3019 case c_dv_data:
3020 case c_dv_ptr_data:
3021 case c_dv_data_slice:
3022 case c_dv_ptr_data_slice:
3024 if (m_vars[i].direction.out) {
3025 event_type = c_last_read;
3027 else if (last_in < 0 && m_vars[i].direction.in) {
3028 last_in = i;
3030 break;
3031 default:
3032 break;
3034 if (event_type == c_last_read) {
3035 break;
3039 if (event_type == c_last_read) {
3040 m_vars_extra[i].omp_last_event_type = c_last_read;
3042 else if (event_type == c_last_write) {
3043 m_vars_extra[last_in].omp_last_event_type = c_last_write;
3045 m_omp_async_last_event_type = event_type;
3046 OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
3047 m_omp_async_last_event_type);
3050 extern "C" {
3051 void offload_proxy_task_completed_ooo(
3052 COIEVENT e,
3053 const COIRESULT r,
3054 const void *info
3057 task_completion_callback ((void *) info);
3060 // Callback function for asynchronous offloads
3061 void offload_complete_task(
3062 COIEVENT e,
3063 const COIRESULT r,
3064 const void *info
3067 Stream *stream;
3068 OffloadDescriptor *task = const_cast<OffloadDescriptor*>(
3069 reinterpret_cast<const OffloadDescriptor*>(info));
3070 uint32_t events_remained;
3072 lock_complete.lock();
3073 if (!offload_descr_map[task]) {
3074 lock_complete.unlock();
3075 return;
3078 #ifndef TARGET_WINNT
3079 events_remained = __sync_sub_and_fetch(&task->m_event_count, 1);
3080 #else // TARGET_WINNT
3081 events_remained = _InterlockedDecrement(&task->m_event_count);
3082 #endif // TARGET_WINNT
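    // Both intrinsics atomically decrement m_event_count and return the
    // new value, so exactly one callback invocation observes 0 and goes on
    // to complete the task below.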
3083 // Waiting for the last event
3084 if (events_remained != 0) {
3085 lock_complete.unlock();
3086 return;
3090     // The callback could be called after execution at the host has completed.
3091     // Do nothing, as the engine data has been destructed
3091 if (!task->get_device().get_ready()) {
3092 lock_complete.unlock();
3093 return;
3096 void * signal = task->get_signal();
3097 _Offload_stream stream_handle = task->get_stream();
3099 OFFLOAD_TRACE(2, "Call function offload_complete_task(%p)\n", info);
3101 // Completed offload has a signal
3102 if (task->m_has_signal) {
3103 if (!offload_descr_map[task]) {
3104 lock_complete.unlock();
3105 return;
3107 task->get_device().complete_signaled_ofld(signal);
3108 // Asynchronous offload can have both signal and stream. Need to
3109 // clean stream if any.
3110 stream_handle = task->get_stream();
3111 if (stream_handle != -1) {
3112 stream = Stream::find_stream(stream_handle, false);
3113 if (stream && stream->get_last_offload() == task) {
3114 stream->set_last_offload(NULL);
3117 offload_descr_map[task] = false;
3118 lock_complete.unlock();
3120 if (task->offload_finish(0)) { //arg is 0 for is_traceback
3121 task->cleanup();
3123 delete task;
3125 // Asynchronous by stream
3126 else {
3127 if (stream_handle != 0) {
3128 stream = Stream::find_stream(stream_handle, false);
3130 // the stream was not created or was destroyed
3131 if (!stream) {
3132 LIBOFFLOAD_ERROR(c_offload_no_stream,
3133 task->get_device().get_logical_index());
3134 LIBOFFLOAD_ABORT;
3136 if (!offload_descr_map[task]) {
3137 lock_complete.unlock();
3138 return;
3140 if (task == stream->get_last_offload()) {
3141 stream->set_last_offload(NULL);
3143 // if the offload has both signal and stream we will complete
3144 // it as it has the signal. So we don't need to mark signal
3145 // as completed.
3146 offload_descr_map[task] = false;
3147 lock_complete.unlock();
3148 if (task->offload_finish(0)) { //arg is 0 for is_traceback
3149 task->cleanup();
3151 delete task;
3157 void OffloadDescriptor::register_omp_event_call_back(
3158 const COIEVENT *event,
3159 const void *info)
3161 register_event_call_back(&offload_proxy_task_completed_ooo, event, info);
3164 void OffloadDescriptor::register_event_call_back(
3165 void (*func)(COIEVENT, const COIRESULT, const void*),
3166 const COIEVENT *event,
3167 const void *info)
3169 OFFLOAD_TRACE(2, "register_event_call_back(event=%p, info=%p)\n",
3170 event, info);
3171 if (COI::EventRegisterCallback) {
3172 COI::EventRegisterCallback(
3173 *event,
3174 func,
3175 info, 0);
3176 OFFLOAD_TRACE(2,
3177 "COI::EventRegisterCallback found; callback registered\n");
3181 bool OffloadDescriptor::wait_dependencies(
3182 const void **waits,
3183 int num_waits,
3184 _Offload_stream handle
3187 OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
3188 bool ret = true;
3189 OffloadDescriptor *task;
3190 void * signal;
3192 if (num_waits == 0) {
3193         // Prepare in-dependencies for the stream
3194         get_stream_in_dependencies(m_num_in_dependencies, m_p_in_dependencies);
3195 return true;
3198 // wait for streams
3199 if (num_waits == -1) {
3200 Stream * stream;
3201 // some specific stream of the device
3202 if (handle != 0) {
3203 lock_complete.lock();
3204 stream = Stream::find_stream(handle, false);
3206 // the stream was not created or was destroyed
3207 if (!stream) {
3208 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
3209 LIBOFFLOAD_ABORT;
3211 task = stream->get_last_offload();
3213 // offload was completed by previous offload_wait pragma
3214 // or wait clause
3215 if (!offload_descr_map[task]) {
3216 lock_complete.unlock();
3217 return true;
3219 stream->set_last_offload(NULL);
3220 if (task->m_has_signal) {
3221 signal = task->get_signal();
3222 if (m_device.find_signal(signal, false) == task) {
3223 m_device.complete_signaled_ofld(signal);
3226 offload_descr_map[task] = false;
3227 lock_complete.unlock();
3229 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3230 ret = false;
3232 task->cleanup();
3233 delete task;
3235 // all streams of the device or over all devices
3236 else {
3237 StreamMap stream_map = Stream::all_streams;
3238 for (StreamMap::iterator it = stream_map.begin();
3239 it != stream_map.end(); it++) {
3240 Stream * stream = it->second;
3241 if (!m_wait_all_devices &&
3242 stream->get_device() != m_device.get_logical_index()) {
3243 continue;
3245 lock_complete.lock();
3247 // get associated async task
3248 OffloadDescriptor *task = stream->get_last_offload();
3249 // offload was completed by offload_wait pragma or wait clause
3250 if (!offload_descr_map[task]) {
3251 lock_complete.unlock();
3252 continue;
3254 if (task->m_has_signal) {
3255 signal = task->get_signal();
3256 if (task->get_device().find_signal(signal, false) ==
3257 task) {
3258 task->get_device().complete_signaled_ofld(signal);
3261 stream->set_last_offload(NULL);
3262 offload_descr_map[task] = false;
3263 lock_complete.unlock();
3264 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3265 ret = false;
3267 task->cleanup();
3268 delete task;
3270 // no uncompleted streams
3271 return true;
3274 else {
3276         // If the offload is asynchronous we will not really wait for signals.
3277         // We will collect all awaited events into the m_p_in_dependencies
3278         // vector to be used in future calls to the COI::Copy... API.
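        // A sketch of what the branch below builds: for every waited signal
        // that is still pending, its in-dependency events are appended to a
        // growing p_in_dep array, which then replaces this offload's
        // m_p_in_dependencies instead of blocking the host thread here.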
3280 if (!__offload_always_wait && (m_has_signal || (get_stream() > 0))) {
3281 uint64_t num_in_dep = 0,
3282 num_in_dep_prev = 0;
3283 COIEVENT *p_in_dep = NULL;
3284 _Offload_stream stream_handle = get_stream();
3285 Stream *stream;
3286 bool stream_need_connection = stream_handle > 0;
3288 if (stream_need_connection) {
3289 stream = Stream::find_stream(stream_handle, false);
3290                 // check that the previous offload with the stream_handle
3291                 // has not completed
3292 if (!stream) {
3293 stream_need_connection = false;
3296 for (int i = 0; i < num_waits; i++) {
3297 task = m_device.find_signal(waits[i], false);
3298 if (task == 0) {
3299 LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
3300 waits[i]);
3301 LIBOFFLOAD_ABORT;
3303 else if (task == SIGNAL_HAS_COMPLETED) {
3304 continue;
3306 if (stream_need_connection &&
3307 stream->get_last_offload() == task) {
3308 stream_need_connection = false;
3310 if (!task->m_num_in_dependencies) {
3311 continue;
3313 num_in_dep += task->m_num_in_dependencies;
3314 p_in_dep = (COIEVENT*)realloc(p_in_dep,
3315 sizeof(COIEVENT) * num_in_dep);
3316 if (p_in_dep == NULL)
3317 LIBOFFLOAD_ERROR(c_malloc);
3318 memcpy(p_in_dep + num_in_dep_prev, task->m_p_in_dependencies,
3319 task->m_num_in_dependencies * sizeof(COIEVENT));
3320 num_in_dep_prev = num_in_dep;
3322 if (stream_need_connection) {
3323 task = stream->get_last_offload();
3324 if (task) {
3325 num_in_dep += task->m_num_in_dependencies;
3326 p_in_dep = (COIEVENT*)realloc(p_in_dep,
3327 sizeof(COIEVENT) * num_in_dep);
3328 if (p_in_dep == NULL)
3329 LIBOFFLOAD_ERROR(c_malloc);
3330 memcpy(p_in_dep + num_in_dep_prev,
3331 task->m_p_in_dependencies,
3332 task->m_num_in_dependencies * sizeof(COIEVENT));
3333 num_in_dep_prev = num_in_dep;
3336 m_num_in_dependencies = num_in_dep ? num_in_dep :
3337 m_num_in_dependencies;
3338 m_p_in_dependencies = num_in_dep ? p_in_dep : m_p_in_dependencies;
3340 // wait and do offload_finish for serial offload
3341 else {
3342 for (int i = 0; i < num_waits; i++) {
3343 _Offload_stream stream_handle;
3344 Stream *stream;
3346 lock_complete.lock();
3347 task = m_device.find_signal(waits[i], false);
3348 if (task == 0) {
3349 LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
3350 waits[i]);
3351 LIBOFFLOAD_ABORT;
3353 else if (!offload_descr_map[task]) {
3354 lock_complete.unlock();
3355 continue;
3357                 // Need to mark the signal as completed to prevent a race
3358                 // condition with the call to "offload_complete_task" for the
3359                 // same signal.
3360 m_device.complete_signaled_ofld(waits[i]);
3362 // Asynchronous offload can have both signal and stream.
3363 // Need to clean stream if any.
3365 stream_handle = task->m_stream;
3366 if (stream_handle != -1) {
3367 stream = Stream::find_stream(stream_handle, false);
3368 if (stream && stream->get_last_offload() == task) {
3369 stream->set_last_offload(NULL);
3372 offload_descr_map[task] = false;
3373 lock_complete.unlock();
3375 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3376 ret = false;
3378 task->cleanup();
3380 delete task;
3384 return ret;
3387 bool OffloadDescriptor::offload_wrap(
3388 const char *name,
3389 bool is_empty,
3390 VarDesc *vars,
3391 VarDesc2 *vars2,
3392 int vars_total,
3393 const void **waits,
3394 int num_waits,
3395 const void **signal,
3396 int entry_id,
3397 const void *stack_addr,
3398 OffloadFlags offload_flags
3401 OffloadWaitKind wait_kind = c_offload_wait_signal;
3402 bool is_traceback = offload_flags.bits.fortran_traceback;
3404     // define the kind of wait, if any;
3405     // it can be one of the following kinds:
3406 // 1. c_offload_wait_signal for "offload_wait wait(signal)"
3407 // 2. c_offload_wait_stream for "offload_wait stream(stream)"
3408 // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
3409 if (num_waits == -1) {
3410 wait_kind = (m_stream == 0) ?
3411 c_offload_wait_all_streams :
3412 c_offload_wait_stream;
3414 char buf[35];
3415 const char *stream_str;
3417     if (m_stream == no_stream || num_waits == -1) {
3418 stream_str = "none";
3420 else if (m_stream == 0) {
3421 stream_str = "all";
3423 else {
3424 sprintf(buf, "%#llx", m_stream);
3425 stream_str = buf;
3428 if (m_has_signal) {
3429 OFFLOAD_DEBUG_TRACE_1(1,
3430 GET_OFFLOAD_NUMBER(get_timer_data()),
3431 c_offload_init_func,
3432 "Offload function %s, is_empty=%d, #varDescs=%d, "
3433 "signal=none, stream=%s, #waits=%d%c",
3434 name, is_empty, vars_total, stream_str, num_waits,
3435 num_waits == 0 ? '\n' : ' ');
3436 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
3437 // since the number of waits is not fixed.
3438 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
3439 if (num_waits) {
3440 printf("(");
3441 if (m_stream == no_stream) {
3442 printf("%p", waits[0]);
3443 for (int i = 1; i < num_waits; i++) {
3444 printf(", %p", waits[i]);
3447 else if (m_stream != 0) {
3448 printf("%#x", m_stream);
3450 else {
3451 printf(" all streams");
3453 printf(")");
3455 printf("\n");
3456 fflush(NULL);
3458 // stream in wait is reported further in OFFLOAD_REPORT for waits
3459 if (m_stream != no_stream && num_waits == 0) {
3460 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3461 c_offload_stream,
3462 "%d\n", m_stream);
3464 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3465 c_offload_signal,
3466 "none %d\n", 0);
3468 else {
3469 OFFLOAD_DEBUG_TRACE_1(1,
3470 GET_OFFLOAD_NUMBER(get_timer_data()),
3471 c_offload_init_func,
3472 "Offload function %s, is_empty=%d, #varDescs=%d, "
3473 "signal=%p, stream=%s, #waits=%d%c",
3474 name, is_empty, vars_total, signal, stream_str,
3475 num_waits, num_waits == 0 ? '\n' : ' ');
3476 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
3477 // since the number of waits is not fixed.
3478 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
3479 if (num_waits) {
3480 printf("(");
3481 if (m_stream == no_stream) {
3482 printf("%p", waits[0]);
3483 for (int i = 1; i < num_waits; i++) {
3484 printf(", %p", waits[i]);
3486 printf(")");
3488 else if (m_stream != 0) {
3489 printf("%#x", m_stream);
3491 else {
3492 printf(" all streams");
3494 printf(")");
3496 printf("\n");
3497 fflush(NULL);
3499 // stream in wait is reported further in OFFLOAD_REPORT for waits
3500 if (m_stream != no_stream && num_waits == 0) {
3501 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3502 c_offload_stream,
3503 "%d\n", m_stream);
3505 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3506 c_offload_signal,
3507 "%d\n", signal);
3509 if (console_enabled >= 1 && offload_flags.flags != 0) {
3510 trace_offload_flags(get_timer_data(), offload_flags);
3513 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3514 c_offload_wait, "%d\n",
3515 wait_kind, num_waits,
3516 (wait_kind == c_offload_wait_signal) ?
3517 waits :
3518 reinterpret_cast<const void **>(m_stream));
3520 if (m_status != 0) {
3521 m_status->result = OFFLOAD_SUCCESS;
3522 m_status->device_number = m_device.get_logical_index();
3525 m_initial_need_runfunction = m_need_runfunction = !is_empty;
3527 // wait for dependencies to finish or set
3528 // m_num_in_dependencies and m_p_in_dependencies for asynchronous offload
3529 if (!wait_dependencies(waits, num_waits, m_stream)) {
3530 cleanup();
3531 return false;
3534 // setup buffers
3535 if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
3536 cleanup();
3537 return false;
3540 if (offload_flags.bits.omp_async) {
3541 setup_omp_async_info();
3544 // initiate send for pointers. Want to do it as early as possible.
3545 if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
3546 signal)) {
3547 cleanup();
3548 return false;
3551 // setup misc data for run function
3552 if (!setup_misc_data(name)) {
3553 cleanup();
3554 return false;
3557 // gather copyin data into buffer
3558 if (!gather_copyin_data()) {
3559 cleanup();
3560 return false;
3563 // Start the computation
3564 if (!compute(signal)) {
3565 cleanup();
3566 return false;
3569 // initiate receive for pointers
3570 if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
3571 true, signal)) {
3572 cleanup();
3573 return false;
3576 if (offload_flags.bits.omp_async) {
3577 return true;
3580     // if there is a signal or stream, save the descriptor for later use.
3581 // num_waits == -1 is for offload_wait and there is nothing to save
3582 if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
3584 if (signal != 0) {
3585 m_device.add_signal(*signal, this);
3588 if (m_stream != no_stream && m_stream != 0) {
3589 Stream* stream = Stream::find_stream(m_stream, false);
3590 if (stream) {
3591 stream->set_last_offload(this);
3593 else {
3594 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
3595 LIBOFFLOAD_ABORT;
3598 // Register callback function "offload_complete_task" for all out
3599 // events or for all in events if there are no out transfers
3600 if (!m_preallocated_alloc) {
3601 m_event_count = m_out_deps_total ?
3602 m_out_deps_total : m_in_deps_total;
3603 COIEVENT *event_list = m_out_deps_total ? m_out_deps : m_in_deps;
3605 for (int i = 0; i < m_event_count; i++) {
3606 register_event_call_back(&offload_complete_task,
3607 &event_list[i], this);
3609 offload_descr_map[this] = true;
3610 return true;
3614 // wait for the offload to finish.
3615 if (!offload_finish(is_traceback)) {
3616 cleanup();
3617 return false;
3620 cleanup();
3621 return true;
3624 bool OffloadDescriptor::offload(
3625 const char *name,
3626 bool is_empty,
3627 VarDesc *vars,
3628 VarDesc2 *vars2,
3629 int vars_total,
3630 const void **waits,
3631 int num_waits,
3632 const void **signal,
3633 int entry_id,
3634 const void *stack_addr,
3635 OffloadFlags offload_flags
3638 bool res;
3639 res = offload_wrap(name, is_empty, vars, vars2, vars_total,
3640 waits, num_waits, signal, entry_id,
3641 stack_addr, offload_flags);
3642 if (res == false && !m_traceback_called) {
3643 if (offload_flags.bits.fortran_traceback) {
3644 OFFLOAD_TRACE(3,
3645 "Calling Fortran library to continue traceback from MIC\n");
3646 FORTRAN_TRACE_BACK(m_status->result);
3647 m_traceback_called = true;
3650 return res;
3653 bool OffloadDescriptor::offload_finish(
3654 bool is_traceback
3657 COIRESULT res;
3659 // wait for compute dependencies to become signaled
3660 if (m_in_deps_total > 0 &&
3661 (m_out_deps_total <= 0 || m_preallocated_alloc)) {
3662 OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
3664 if (__offload_active_wait) {
3665 // keep CPU busy
3666 do {
3667 res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
3669 while (res == COI_TIME_OUT_REACHED);
3671 else {
3672 res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
3675 if (res != COI_SUCCESS) {
3676 if (m_status != 0 && !m_traceback_called) {
3677 m_status->result = translate_coi_error(res);
3678 if (is_traceback) {
3679 OFFLOAD_TRACE(3,
3680 "Calling Fortran library to continue traceback from MIC\n");
3681 FORTRAN_TRACE_BACK(m_status->result);
3682 m_traceback_called = true;
3684 return false;
3686 if (is_traceback && !m_traceback_called) {
3687 OFFLOAD_TRACE(3,
3688 "Calling Fortran library to continue traceback from MIC\n");
3689 FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
3690 exit(1);
3692 report_coi_error(c_event_wait, res);
3696     // need to scatter the copyout data received from the target after
3697     // completing in-dependencies to get preallocated buffers.
3698     // If there are no preallocated buffers we will scatter_copyout_data
3699     // after completing out-dependencies. In this case we don't need to wait
3700     // on in-dependencies as they are already in the DAG.
3701 if (m_out_with_preallocated) {
3702 if (!scatter_copyout_data()) {
3703 return false;
3705 if (!receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
3706 cleanup();
3707 return false;
3711 // wait for receive dependencies to become signaled
3712 if (m_out_deps_total > 0) {
3713 OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
3715 if (__offload_active_wait) {
3716 // keep CPU busy
3717 do {
3718 res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
3720 while (res == COI_TIME_OUT_REACHED);
3722 else {
3723 res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
3726 if (res != COI_SUCCESS) {
3727 if (m_status != 0 && !m_traceback_called) {
3728 m_status->result = translate_coi_error(res);
3729 if (is_traceback) {
3730 OFFLOAD_TRACE(3,
3731 "Calling Fortran library to continue traceback from MIC\n");
3732 FORTRAN_TRACE_BACK(m_status->result);
3733 m_traceback_called = true;
3735 return false;
3737 if (is_traceback && !m_traceback_called) {
3738 OFFLOAD_TRACE(3,
3739 "Calling Fortran library to continue traceback from MIC\n");
3740 FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
3741 exit(1);
3743 report_coi_error(c_event_wait, res);
3747 if (!m_out_with_preallocated && !scatter_copyout_data()) {
3748 return false;
3750 // destroy buffers
3752 OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
3754 for (BufferList::const_iterator it = m_destroy_buffers.begin();
3755 it != m_destroy_buffers.end(); it++) {
3756 res = COI::BufferDestroy(*it);
3757 if (res != COI_SUCCESS) {
3758 if (m_status != 0) {
3759 m_status->result = translate_coi_error(res);
3760 return false;
3762 report_coi_error(c_buf_destroy, res);
3767 return true;
3770 void OffloadDescriptor::cleanup()
3772 // release device in orsl
3773 ORSL::release(m_device.get_logical_index());
3775 OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
3777 // report stuff
3778 Offload_Report_Epilog(get_timer_data());
3781 bool OffloadDescriptor::is_signaled()
3783 bool signaled = true;
3784 COIRESULT res;
3786 // check compute and receive dependencies
3787 if (m_out_deps_total > 0) {
3788 res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
3789 signaled = signaled && (res == COI_SUCCESS);
3791 else if (m_in_deps_total > 0) {
3792 res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
3793 signaled = signaled && (res == COI_SUCCESS);
3796 return signaled;
3799 static Arr_Desc * make_arr_desc(
3800 void* ptr_val,
3801 int64_t extent_start_val,
3802 int64_t extent_elements_val,
3803 int64_t size
3806 Arr_Desc *res;
3807 res = (Arr_Desc *)malloc(sizeof(Arr_Desc));
3808 if (res == NULL)
3809 LIBOFFLOAD_ERROR(c_malloc);
3810 res->base = reinterpret_cast<int64_t>(ptr_val);
3811 res->rank = 1;
3812 res->dim[0].size = size;
3813 res->dim[0].lindex = 0;
3814 res->dim[0].lower = extent_start_val;
3815 res->dim[0].upper = extent_elements_val + extent_start_val - 1;
3816 res->dim[0].stride = 1;
3817 return res;
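// Hypothetical usage of the helper above: for some pointer p, describe
// 100 contiguous 8-byte elements starting at element 0, i.e. base = p,
// rank = 1, lower = 0, upper = 99, stride = 1:
//   Arr_Desc *ad = make_arr_desc(p, 0, 100, 8);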
3820 // Send pointer data if the source or the destination or both are
3821 // noncontiguous. It is guaranteed that the destination length is
3822 // sufficient for the transferred data.
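// The fallback loop below walks both sides range by range: each
// iteration transfers send_size bytes (the smaller of the source and
// destination contiguous range sizes) and fetches the next range on
// whichever side has been exhausted.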
3823 bool OffloadDescriptor::send_noncontiguous_pointer_data(
3824 int i,
3825 PtrData* src_data,
3826 PtrData* dst_data,
3827 COIEVENT *event,
3828 uint64_t &data_sent,
3829 uint32_t in_deps_amount,
3830 COIEVENT *in_deps
3833 NonContigDesc *desc;
3834 int noncont_num;
3835 int64_t offset_src, offset_dst;
3836 int64_t length_src, length_dst;
3837 int64_t length_src_cur, length_dst_cur;
3838 int64_t send_size;
3839 COIRESULT res;
3840 bool dst_is_empty = true;
3841 bool src_is_empty = true;
3843     // If BufferWriteMultiD is available we can set up the required arguments
3844     // and transfer the noncontiguous data via a single call to the COI routine.
3845 if (!m_vars[i].flags.is_non_cont_struct &&
3846 __offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
3847 struct Arr_Desc* arr_desc_dst;
3848 struct Arr_Desc* arr_desc_src;
3849 int64_t size_src, size_dst;
3850 char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
3851 m_vars_extra[i].type_src);
3852 COIBUFFER dst_buf = m_vars[i].into ?
3853 m_vars_extra[i].dst_data->mic_buf :
3854 m_vars_extra[i].src_data->mic_buf;
3856 offset_src = (m_vars_extra[i].read_rng_src)?
3857 m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp;
3858 size_src = m_vars_extra[i].read_rng_src ?
3859 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
3860 m_vars[i].size;
3862 offset_dst = (m_vars_extra[i].read_rng_dst)?
3863 m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp;
3864 size_dst = m_vars_extra[i].read_rng_dst ?
3865 cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
3867 int64_t el_size = (!m_vars[i].into ||
3868 (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ?
3870 m_vars_extra[i].read_rng_src ?
3871 m_vars_extra[i].read_rng_src->arr_desc->dim[
3872 m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
3873 m_vars_extra[i].read_rng_dst->arr_desc->dim[
3874 m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
3876 arr_desc_src = (m_vars_extra[i].read_rng_src) ?
3877 m_vars_extra[i].read_rng_src->arr_desc :
3878             make_arr_desc(NULL, // not required for source
3879 offset_src/el_size, size_src/el_size, el_size);
3881 arr_desc_dst = !m_vars[i].into ?
3882 arr_desc_src :
3883 (m_vars_extra[i].read_rng_dst) ?
3884 m_vars_extra[i].read_rng_dst->arr_desc :
3885 make_arr_desc(NULL,
3886 offset_dst/el_size, size_src/el_size, el_size);
3888 int64_t alloc_disp = m_vars[i].into ?
3889 m_vars_extra[i].dst_data->alloc_disp :
3890 m_vars_extra[i].src_data->alloc_disp;
3892 arr_desc_dst->base = 0;
3893 arr_desc_src->base = reinterpret_cast<int64_t>(base);
3895 res = COI::BufferWriteMultiD(
3896 dst_buf, // in_DestBuffer,
3897 NULL, // DestProcess,
3898 m_vars[i].offset + m_vars[i].mic_offset -
3899 alloc_disp, // Offset
3900 (void*)arr_desc_dst, // descriptor of DestArray
3901 (void*)arr_desc_src, // descriptor of SrcArray
3902 COI_COPY_UNSPECIFIED, // Type
3903 m_num_in_dependencies, // Number of in Dependencies
3904 m_p_in_dependencies, // array of in Dependencies
3905 event); // out Dependency
3906 if (res != COI_SUCCESS) {
3907 if (m_status != 0) {
3908 m_status->result = translate_coi_error(res);
3909 return false;
3911 report_coi_error(c_buf_copy, res);
3913 return(true);
3916 data_sent = 0;
3917 if (m_vars[i].flags.is_non_cont_struct) {
3918 desc = m_vars_extra[i].noncont_desc;
3919 noncont_num = 0;
3921 else {
3922 // Set length_src and length_dst
3923 length_src = (m_vars_extra[i].read_rng_src) ?
3924 m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
3925 length_dst = !m_vars[i].into ? length_src :
3926 (m_vars_extra[i].read_rng_dst) ?
3927 m_vars_extra[i].read_rng_dst->range_size :
3928 m_vars[i].size;
3929 send_size = (length_src < length_dst) ? length_src : length_dst;
3932     // if an event is defined we must replicate it for all contiguous ranges
3933     // that will be copied/written.
3934     // Take into account that we already have 1 event.
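    // for instance, if length_src spans four send_size chunks, one event is
    // already accounted for, m_in_deps_allocated grows by 4, and
    // m_in_deps_total is rolled back by one before the loop below issues an
    // event per transferred chunk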
3935 if (event) {
3936 uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
3937 desc->interval_cnt :
3938 (length_src / send_size) *
3939 ((m_vars_extra[i].read_rng_src) ?
3940 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
3941 m_in_deps_allocated += range_num ;
3942 m_in_deps =
3943 (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
3944 m_in_deps_total--;
3947     // successively get contiguous ranges,
3948     // compute the corresponding destination offset and send the data
3949 do {
3950 if (m_vars[i].flags.is_non_cont_struct) {
3951 // ranges are over
3952 if (noncont_num >= desc->interval_cnt) {
3953 break;
3955 offset_src = offset_dst = desc->interval[noncont_num].lower;
3956 send_size = desc->interval[noncont_num].size;
3957 noncont_num++;
3959 else {
3960 if (src_is_empty) {
3961 if (m_vars_extra[i].read_rng_src) {
3962 if (!get_next_range(m_vars_extra[i].read_rng_src,
3963 &offset_src)) {
3964 // source ranges are over - nothing to send
3965 break;
3968 else if (data_sent == 0) {
3969 offset_src = m_vars_extra[i].cpu_disp;
3971 else {
3972 break;
3974 length_src_cur = length_src;
3976 else {
3977             // if the source is contiguous or its contiguous range is
3978             // greater than the destination one
3979 offset_src += send_size;
3981 length_src_cur -= send_size;
3982 src_is_empty = length_src_cur == 0;
3984 if (dst_is_empty) {
3985 if (m_vars[i].into) {
3986 if (m_vars_extra[i].read_rng_dst) {
3987 if (!get_next_range(m_vars_extra[i].read_rng_dst,
3988 &offset_dst)) {
3989 // destination ranges are over
3990 LIBOFFLOAD_ERROR(c_destination_is_over);
3991 return false;
3994 // into is contiguous.
3995 else {
3996 offset_dst = m_vars[i].disp;
3998 length_dst_cur = length_dst;
4000 // same as source
4001 else {
4002 offset_dst = offset_src;
4003 length_dst_cur = length_src;
4006 else {
4007             // if the destination is contiguous or its contiguous range is
4008             // greater than the source one
4009 offset_dst += send_size;
4011 length_dst_cur -= send_size;
4012 dst_is_empty = length_dst_cur == 0;
4014 if (event) {
4015 event = &m_in_deps[m_in_deps_total++];
4017 if (src_data != 0 && src_data->cpu_buf != 0) {
4018 res = COI::BufferCopy(
4019 dst_data->mic_buf,
4020 src_data->cpu_buf,
4021 m_vars[i].mic_offset +
4022 m_vars[i].offset + offset_dst,
4023 m_vars_extra[i].cpu_offset + offset_src,
4024 send_size,
4025 COI_COPY_UNSPECIFIED,
4026 m_num_in_dependencies,
4027 m_p_in_dependencies,
4028 event);
4029 if (res != COI_SUCCESS) {
4030 if (m_status != 0) {
4031 m_status->result = translate_coi_error(res);
4032 return false;
4034 report_coi_error(c_buf_copy, res);
4037 else {
4038 char *base = offload_get_src_base(m_vars[i].ptr,
4039 m_vars_extra[i].type_src);
4041 res = COI::BufferWrite(
4042 dst_data->mic_buf,
4043 m_vars[i].mic_offset +
4044 m_vars[i].offset + offset_dst,
4045 base + offset_src,
4046 send_size,
4047 COI_COPY_UNSPECIFIED,
4048 m_num_in_dependencies,
4049 m_p_in_dependencies,
4050 event);
4051 if (res != COI_SUCCESS) {
4052 if (m_status != 0) {
4053 m_status->result = translate_coi_error(res);
4054 return false;
4056 report_coi_error(c_buf_write, res);
4059 data_sent += send_size;
4061 while (true);
4062 return true;
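// Sends pointer-based "in" data to the target. Each transfer is issued as
// BufferCopy when a host-side COI buffer exists for the source, or as
// BufferWrite from raw host memory otherwise; noncontiguous variables are
// routed through send_noncontiguous_pointer_data above.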
4065 bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
4067 OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
4069 bool should_use_async_buffer_write = m_initial_need_runfunction;
4070 uint64_t ptr_sent = 0;
4071 COIRESULT res;
4072 uint32_t in_deps_amount = 0;
4073 COIEVENT *in_deps = NULL;
4075 // For offload_transfer and offload with an empty body without a signal:
4076 // - if there is only one buffer copy - send data synchronously
4077 // - if there are multiple buffer copies and
4078 // __offload_parallel_copy is false - send data synchronously
4079 // - if there are multiple buffer copies and
4080 // __offload_parallel_copy is true - send data asynchronously
4081 // This concerns only large data - greater than __offload_use_async_buffer_write.
4082 // Data of size less than __offload_use_async_buffer_write are sent synchronously.
4083 // Synchronous transfer results in better performance in COI.
4084 // __offload_parallel_copy is false by default but can be changed
4085 // via the environment variable OFFLOAD_PARALLEL_COPY
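// Illustrative example (not from the source): for an offload_transfer with
// two large "in" pointers and OFFLOAD_PARALLEL_COPY=1, both writes are given
// out-events and may overlap; with the default setting each transfer
// completes before the next one is issued.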
4086 if (!m_initial_need_runfunction && __offload_parallel_copy) {
4087 int big_size_count = 0;
4088 for (int i = 0; i < m_vars_total; i++) {
4089 if (m_vars[i].direction.in &&
4090 m_vars[i].size >= __offload_use_async_buffer_write) {
4091 switch (m_vars_extra[i].type_dst) {
4092 case c_data:
4093 case c_void_ptr:
4094 case c_void_ptr_ptr:
4095 case c_cean_var:
4096 if (m_vars[i].flags.is_static_dstn) {
4097 big_size_count++;
4099 break;
4100 case c_string_ptr:
4101 case c_string_ptr_ptr:
4102 case c_data_ptr:
4103 case c_data_ptr_ptr:
4104 case c_cean_var_ptr:
4105 case c_cean_var_ptr_ptr:
4106 case c_dv_ptr:
4107 case c_dv_data:
4108 case c_dv_ptr_data:
4109 case c_dv_data_slice:
4110 case c_dv_ptr_data_slice:
4111 big_size_count++;
4112 break;
4113 default:
4114 break;
4118 if (big_size_count > 1) {
4119 should_use_async_buffer_write = true;
4123 // Initiate send for pointer data
4124 for (int i = 0; i < m_vars_total; i++) {
4125 uint64_t sent_data = m_vars[i].size;
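// If this variable closes an OpenMP async "last write" event, make all
// in-events accumulated so far the input dependencies of its transfer so
// that the event callback fires only after the preceding transfers finish.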
4127 if (m_vars_extra[i].omp_last_event_type == c_last_write &&
4128 m_in_deps_total > 0) {
4129 m_num_in_dependencies = m_in_deps_total;
4130 m_p_in_dependencies = m_in_deps;
4132 switch (m_vars_extra[i].type_dst) {
4133 case c_data_ptr_array:
4134 break;
4135 case c_data:
4136 case c_void_ptr:
4137 case c_void_ptr_ptr:
4138 case c_cean_var:
4139 if (m_vars[i].direction.in &&
4140 m_vars[i].flags.is_static_dstn) {
4141 COIEVENT *event =
4142 (m_stream != no_stream ||
4143 is_async ||
4144 (should_use_async_buffer_write &&
4145 m_vars[i].size >= __offload_use_async_buffer_write)) ?
4146 &m_in_deps[m_in_deps_total++] : 0;
4147 PtrData* dst_data = m_vars[i].into ?
4148 m_vars_extra[i].dst_data :
4149 m_vars_extra[i].src_data;
4150 PtrData* src_data =
4151 VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4152 VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4153 m_vars[i].flags.is_static ?
4154 m_vars_extra[i].src_data : 0;
4156 if (m_vars[i].flags.is_non_cont_struct ||
4157 m_vars[i].flags.is_noncont_src ||
4158 m_vars[i].flags.is_noncont_dst) {
4159 if (!send_noncontiguous_pointer_data(
4160 i, src_data, dst_data, event, sent_data,
4161 m_num_in_dependencies, m_p_in_dependencies)) {
4162 return false;
4165 else if (src_data != 0 && src_data->cpu_buf != 0) {
4166 res = COI::BufferCopy(
4167 dst_data->mic_buf,
4168 src_data->cpu_buf,
4169 m_vars[i].mic_offset +
4170 m_vars[i].offset + m_vars[i].disp,
4171 m_vars_extra[i].cpu_offset +
4172 m_vars_extra[i].cpu_disp,
4173 m_vars[i].size,
4174 COI_COPY_UNSPECIFIED,
4175 m_num_in_dependencies,
4176 m_p_in_dependencies,
4177 event);
4178 if (res != COI_SUCCESS) {
4179 if (m_status != 0) {
4180 m_status->result = translate_coi_error(res);
4181 return false;
4183 report_coi_error(c_buf_copy, res);
4186 else {
4187 char *base = offload_get_src_base(m_vars[i].ptr,
4188 m_vars_extra[i].type_src);
4189 res = COI::BufferWrite(
4190 dst_data->mic_buf,
4191 m_vars[i].mic_offset +
4192 m_vars[i].offset + m_vars[i].disp,
4193 base + m_vars_extra[i].cpu_disp,
4194 m_vars[i].size,
4195 COI_COPY_UNSPECIFIED,
4196 m_num_in_dependencies,
4197 m_p_in_dependencies,
4198 event);
4199 if (res != COI_SUCCESS) {
4200 if (m_status != 0) {
4201 m_status->result = translate_coi_error(res);
4202 return false;
4204 report_coi_error(c_buf_write, res);
4207 ptr_sent += sent_data;
4209 break;
4211 case c_data_ptr:
4212 // If use_device_ptr is set, no data needs to be sent
4213 if (m_vars[i].flags.use_device_ptr) {
4214 break;
4216 case c_string_ptr:
4217 case c_string_ptr_ptr:
4218 case c_data_ptr_ptr:
4219 case c_cean_var_ptr:
4220 case c_cean_var_ptr_ptr:
4221 case c_dv_ptr:
4222 if (m_vars[i].direction.in && m_vars[i].size > 0) {
4223 COIEVENT *event =
4224 (m_stream != no_stream ||
4225 is_async ||
4226 (should_use_async_buffer_write &&
4227 m_vars[i].size >= __offload_use_async_buffer_write)) ?
4228 &m_in_deps[m_in_deps_total++] : 0;
4229 PtrData* dst_data = m_vars[i].into ?
4230 m_vars_extra[i].dst_data :
4231 m_vars_extra[i].src_data;
4232 PtrData* src_data =
4233 VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4234 VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4235 m_vars[i].flags.is_static ?
4236 m_vars_extra[i].src_data : 0;
4238 if (m_vars[i].flags.is_non_cont_struct ||
4239 m_vars[i].flags.is_noncont_src ||
4240 m_vars[i].flags.is_noncont_dst) {
4241 send_noncontiguous_pointer_data(
4242 i, src_data, dst_data, event, sent_data,
4243 in_deps_amount, in_deps);
4245 else if (src_data != 0 && src_data->cpu_buf != 0) {
4246 res = COI::BufferCopy(
4247 dst_data->mic_buf,
4248 src_data->cpu_buf,
4249 m_vars[i].mic_offset +
4250 m_vars[i].offset + m_vars[i].disp,
4251 m_vars_extra[i].cpu_offset +
4252 m_vars_extra[i].cpu_disp,
4253 m_vars[i].size,
4254 COI_COPY_UNSPECIFIED,
4255 m_num_in_dependencies,
4256 m_p_in_dependencies,
4257 event);
4258 if (res != COI_SUCCESS) {
4259 if (m_status != 0) {
4260 m_status->result = translate_coi_error(res);
4261 return false;
4263 report_coi_error(c_buf_copy, res);
4266 else {
4267 char *base = offload_get_src_base(m_vars[i].ptr,
4268 m_vars_extra[i].type_src);
4269 res = COI::BufferWrite(
4270 dst_data->mic_buf,
4271 m_vars[i].mic_offset +
4272 m_vars[i].offset + m_vars[i].disp,
4273 base + m_vars_extra[i].cpu_disp,
4274 m_vars[i].size,
4275 COI_COPY_UNSPECIFIED,
4276 m_num_in_dependencies,
4277 m_p_in_dependencies,
4278 event);
4279 if (res != COI_SUCCESS) {
4280 if (m_status != 0) {
4281 m_status->result = translate_coi_error(res);
4282 return false;
4284 report_coi_error(c_buf_write, res);
4288 ptr_sent += sent_data;
4290 break;
4292 case c_dv_data:
4293 case c_dv_ptr_data:
4294 if (m_vars[i].direction.in &&
4295 m_vars[i].size > 0) {
4296 PtrData *ptr_data = m_vars[i].into ?
4297 m_vars_extra[i].dst_data :
4298 m_vars_extra[i].src_data;
4299 PtrData* src_data = m_vars_extra[i].src_data;
4301 COIEVENT *event =
4302 (m_stream != no_stream ||
4303 is_async ||
4304 (should_use_async_buffer_write &&
4305 m_vars[i].size >= __offload_use_async_buffer_write)) ?
4306 &m_in_deps[m_in_deps_total++] : 0;
4308 if (m_vars[i].flags.is_non_cont_struct ||
4309 m_vars[i].flags.is_noncont_src ||
4310 m_vars[i].flags.is_noncont_dst) {
4311 send_noncontiguous_pointer_data(
4312 i, src_data, ptr_data, event, sent_data,
4313 in_deps_amount, in_deps);
4315 else if (src_data && src_data->cpu_buf != 0) {
4316 res = COI::BufferCopy(
4317 ptr_data->mic_buf,
4318 src_data->cpu_buf,
4319 m_vars[i].offset + ptr_data->mic_offset +
4320 m_vars[i].disp,
4321 m_vars_extra[i].cpu_offset +
4322 m_vars_extra[i].cpu_disp,
4323 m_vars[i].size,
4324 COI_COPY_UNSPECIFIED,
4325 m_num_in_dependencies,
4326 m_p_in_dependencies,
4327 event);
4328 if (res != COI_SUCCESS) {
4329 if (m_status != 0) {
4330 m_status->result = translate_coi_error(res);
4331 return false;
4333 report_coi_error(c_buf_copy, res);
4336 else {
4337 char *base = offload_get_src_base(m_vars[i].ptr,
4338 m_vars_extra[i].type_src);
4339 res = COI::BufferWrite(
4340 ptr_data->mic_buf,
4341 ptr_data->mic_offset +
4342 m_vars[i].offset + m_vars[i].disp,
4343 base + m_vars_extra[i].cpu_disp,
4344 m_vars[i].size,
4345 COI_COPY_UNSPECIFIED,
4346 m_num_in_dependencies,
4347 m_p_in_dependencies,
4348 event);
4349 if (res != COI_SUCCESS) {
4350 if (m_status != 0) {
4351 m_status->result = translate_coi_error(res);
4352 return false;
4354 report_coi_error(c_buf_write, res);
4357 ptr_sent += sent_data;
4359 break;
4361 case c_dv_data_slice:
4362 case c_dv_ptr_data_slice:
4363 if (m_vars[i].direction.in &&
4364 m_vars[i].size > 0) {
4365 PtrData *dst_data = m_vars[i].into ?
4366 m_vars_extra[i].dst_data :
4367 m_vars_extra[i].src_data;
4368 PtrData* src_data =
4369 (VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4370 VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src) ||
4371 VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) ||
4372 VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4373 m_vars[i].flags.is_static) ?
4374 m_vars_extra[i].src_data : 0;
4375 COIEVENT *event =
4376 (m_stream != no_stream ||
4377 is_async ||
4378 (should_use_async_buffer_write &&
4379 m_vars[i].size >= __offload_use_async_buffer_write)) ?
4380 &m_in_deps[m_in_deps_total++] : 0;
4381 if (m_vars[i].flags.is_non_cont_struct ||
4382 m_vars[i].flags.is_noncont_src ||
4383 m_vars[i].flags.is_noncont_dst) {
4384 send_noncontiguous_pointer_data(
4385 i, src_data, dst_data, event, sent_data,
4386 in_deps_amount, in_deps);
4388 else if (src_data && src_data->cpu_buf != 0) {
4389 res = COI::BufferCopy(
4390 dst_data->mic_buf,
4391 src_data->cpu_buf,
4392 m_vars[i].offset +
4393 dst_data->mic_offset +
4394 m_vars[i].disp,
4395 m_vars_extra[i].cpu_offset +
4396 m_vars_extra[i].cpu_disp,
4397 m_vars[i].size,
4398 COI_COPY_UNSPECIFIED,
4399 m_num_in_dependencies,
4400 m_p_in_dependencies,
4401 event);
4402 if (res != COI_SUCCESS) {
4403 if (m_status != 0) {
4404 m_status->result = translate_coi_error(res);
4405 return false;
4407 report_coi_error(c_buf_copy, res);
4410 else {
4411 char *base = offload_get_src_base(m_vars[i].ptr,
4412 m_vars_extra[i].type_src);
4413 res = COI::BufferWrite(
4414 dst_data->mic_buf,
4415 dst_data->mic_offset +
4416 m_vars[i].offset + m_vars[i].disp,
4417 base + m_vars_extra[i].cpu_disp,
4418 m_vars[i].size,
4419 COI_COPY_UNSPECIFIED,
4420 m_num_in_dependencies,
4421 m_p_in_dependencies,
4422 event);
4423 if (res != COI_SUCCESS) {
4424 if (m_status != 0) {
4425 m_status->result = translate_coi_error(res);
4426 return false;
4428 report_coi_error(c_buf_write, res);
4432 ptr_sent += sent_data;
4434 break;
4436 default:
4437 break;
4439 if (m_vars_extra[i].omp_last_event_type == c_last_write) {
4440 register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
4442 // The alloc field isn't used at the target.
4443 // We can reuse it for the offset of array pointers.
4444 if (m_vars_extra[i].is_arr_ptr_el) {
4445 m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
4448 // The list of out-events created during send_pointer_data now becomes the
4449 // input dependencies for the run function (or for Read transfers from the
4450 // target if the run function is absent)
4451 m_num_in_dependencies = m_in_deps_total ? m_in_deps_total :
4452 m_num_in_dependencies;
4453 m_p_in_dependencies = m_in_deps_total ? m_in_deps : m_p_in_dependencies;
4455 if (m_status) {
4456 m_status->data_sent += ptr_sent;
4459 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
4460 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
4461 c_offload_sent_pointer_data,
4462 "Total pointer data sent to target: [%lld] bytes\n",
4463 ptr_sent);
4465 return true;
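// Gathers copy-in data for the run function: obtains the data area (by
// mapping the inout buffer, or in place after the function descriptor),
// copies the VarDesc array into it, then marshals scalar values, dope
// vectors, obsolete stack addresses and function pointers.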
4468 bool OffloadDescriptor::gather_copyin_data()
4470 OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
4472 if (m_need_runfunction && m_in_datalen > 0) {
4473 COIMAPINSTANCE map_inst;
4474 char *data;
4476 // get the data buffer for marshalling
4477 if (m_inout_buf != 0) {
4478 OffloadTimer timer_map(get_timer_data(),
4479 c_offload_host_map_in_data_buffer);
4481 COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
4482 COI_MAP_WRITE_ENTIRE_BUFFER,
4483 0, 0, 0, &map_inst,
4484 reinterpret_cast<void**>(&data));
4485 if (res != COI_SUCCESS) {
4486 if (m_status != 0) {
4487 m_status->result = translate_coi_error(res);
4488 return false;
4490 report_coi_error(c_buf_map, res);
4493 else {
4494 data = (char*) m_func_desc + m_func_desc->data_offset;
4497 // send variable descriptors
4498 memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
4499 data += m_vars_total * sizeof(VarDesc);
4501 // init marshaller
4502 m_in.init_buffer(data, m_in_datalen);
4504 // Gather copy data into buffer
4505 for (int i = 0; i < m_vars_total; i++) {
4506 bool src_is_for_mic = (m_vars[i].direction.out ||
4507 m_vars[i].into == NULL);
4508 PtrData* ptr_data = src_is_for_mic ?
4509 m_vars_extra[i].src_data :
4510 m_vars_extra[i].dst_data;
4511 if (m_vars[i].flags.alloc_disp) {
4512 m_in.send_data(&ptr_data->alloc_disp,
4513 sizeof(ptr_data->alloc_disp));
4515 if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
4516 TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
4517 (m_vars_extra[i].type_src == c_data_ptr_array &&
4518 m_vars[i].flags.is_pointer)) {
4519 m_in.send_data(&m_vars_extra[i].pointer_offset,
4520 sizeof(m_vars_extra[i].pointer_offset));
4522 // send sink address to the target
4523 if (m_vars[i].flags.sink_addr) {
4524 m_in.send_data(&ptr_data->mic_addr,
4525 sizeof(ptr_data->mic_addr));
4528 switch (m_vars_extra[i].type_dst) {
4529 case c_data_ptr_array:
4530 break;
4531 case c_data:
4532 case c_void_ptr:
4533 case c_void_ptr_ptr:
4534 case c_cean_var:
4535 if (m_vars[i].direction.in &&
4536 !m_vars[i].flags.is_static_dstn) {
4538 char *ptr = offload_get_src_base(m_vars[i].ptr,
4539 m_vars_extra[i].type_src);
4540 if (m_vars_extra[i].type_dst == c_cean_var) {
4541 // offset and length are derived from the array
4542 // descriptor
4543 int64_t size = m_vars[i].size;
4544 int64_t disp = m_vars[i].disp;
4545 m_in.send_data(reinterpret_cast<char*>(&size),
4546 sizeof(int64_t));
4547 m_in.send_data(reinterpret_cast<char*>(&disp),
4548 sizeof(int64_t));
4551 m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
4552 m_vars[i].size);
4554 break;
4556 case c_dv:
4557 if (m_vars[i].direction.bits ||
4558 m_vars[i].alloc_if ||
4559 m_vars[i].free_if) {
4560 // send dope vector excluding base
4561 char *ptr = static_cast<char*>(m_vars[i].ptr);
4562 m_in.send_data(ptr + sizeof(uint64_t),
4563 m_vars[i].size - sizeof(uint64_t));
4565 break;
4567 case c_data_ptr:
4568 // send to the target the addresses of obsolete
4569 // stacks to be released
4570 if (m_vars[i].flags.is_stack_buf &&
4571 !m_vars[i].direction.bits &&
4572 m_vars[i].alloc_if &&
4573 m_vars[i].size != 0) {
4574 for (PtrDataList::iterator it =
4575 m_destroy_stack.begin();
4576 it != m_destroy_stack.end(); it++) {
4577 PtrData * ptr_data = *it;
4578 m_in.send_data(&(ptr_data->mic_addr),
4579 sizeof(ptr_data->mic_addr));
4582 break;
4583 case c_func_ptr:
4584 case c_func_ptr_ptr:
4585 if (m_vars[i].direction.in) {
4586 m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
4588 break;
4590 default:
4591 break;
4595 if (m_status) {
4596 m_status->data_sent += m_in.get_tfr_size();
4599 if (m_func_desc->data_offset == 0) {
4600 OffloadTimer timer_unmap(get_timer_data(),
4601 c_offload_host_unmap_in_data_buffer);
4602 COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
4603 if (res != COI_SUCCESS) {
4604 if (m_status != 0) {
4605 m_status->result = translate_coi_error(res);
4606 return false;
4608 report_coi_error(c_buf_unmap, res);
4613 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
4614 OFFLOAD_DEBUG_TRACE_1(1,
4615 GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
4616 "Total copyin data sent to target: [%lld] bytes\n",
4617 m_in.get_tfr_size());
4619 return true;
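// Dispatches the offload task to the device pipeline. On success the task
// completion event becomes the single input dependency for the subsequent
// out-transfers.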
4622 bool OffloadDescriptor::compute(void *info)
4624 OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
4626 if (m_need_runfunction) {
4627 OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
4628 c_offload_compute, "Compute task on MIC\n");
4630 void* misc = m_func_desc;
4631 int misc_len = m_func_desc_size;
4632 void* ret = 0;
4633 int ret_len = 0;
4635 if (m_func_desc->data_offset != 0) {
4636 misc_len += m_in_datalen;
4638 if (m_out_datalen > 0) {
4639 ret = (char*) m_func_desc + m_func_desc->data_offset;
4640 ret_len = m_out_datalen;
4644 // dispatch task
4645 COIRESULT res;
4646 COIEVENT event;
4648 res = m_device.compute(m_stream,
4649 m_compute_buffers,
4650 misc, misc_len,
4651 ret, ret_len,
4652 m_num_in_dependencies,
4653 m_p_in_dependencies,
4654 &event);
4656 if (res != COI_SUCCESS) {
4657 if (m_status != 0) {
4658 m_status->result = translate_coi_error(res);
4659 return false;
4661 report_coi_error(c_pipeline_run_func, res);
4664 if (m_omp_async_last_event_type == c_last_runfunc) {
4665 register_omp_event_call_back(&event, info);
4668 m_in_deps_total = m_num_in_dependencies = 1;
4669 m_in_deps[0] = event;
4670 m_p_in_dependencies = m_in_deps;
4673 return true;
4676 // Receive pointer data if the source, the destination, or both are
4677 // noncontiguous. It is guaranteed that the length of the destination is
4678 // enough for the transferred data.
4679 bool OffloadDescriptor::receive_noncontiguous_pointer_data(
4680 int i,
4681 COIBUFFER dst_buf,
4682 COIEVENT *event,
4683 uint64_t &received_data,
4684 uint32_t in_deps_amount,
4685 COIEVENT *in_deps
4688 NonContigDesc *desc;
4689 int noncont_num;
4690 int64_t offset_src, offset_dst;
4691 int64_t length_src, length_dst;
4692 int64_t length_src_cur, length_dst_cur;
4693 int64_t receive_size;
4694 COIRESULT res;
4695 bool dst_is_empty = true;
4696 bool src_is_empty = true;
4698 char *base = offload_get_src_base(
4699 m_vars[i].into ?
4700 static_cast<char*>(m_vars[i].into) :
4701 static_cast<char*>(m_vars[i].ptr),
4702 m_vars_extra[i].type_dst);
4703 received_data = 0;
4705 // If BufferReadMultiD is defined we can set the values of the required
4706 // arguments and transfer the noncontiguous data via a call to the COI routine.
4707 if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
4708 struct Arr_Desc* arr_desc_dst;
4709 struct Arr_Desc* arr_desc_src;
4710 int64_t size_src, size_dst;
4712 offset_src = (m_vars_extra[i].read_rng_src)?
4713 m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
4714 size_src = m_vars_extra[i].read_rng_src ?
4715 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
4716 m_vars[i].size;
4718 offset_dst = (m_vars_extra[i].read_rng_dst)?
4719 m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp;
4720 size_dst = m_vars_extra[i].read_rng_dst ?
4721 cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
4723 int64_t el_size = (!m_vars[i].into ||
4724 (m_vars_extra[i].read_rng_src &&
4725 m_vars_extra[i].read_rng_dst)) ?
4727 m_vars_extra[i].read_rng_src ?
4728 m_vars_extra[i].read_rng_src->arr_desc->dim[
4729 m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
4730 m_vars_extra[i].read_rng_dst->arr_desc->dim[
4731 m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
4732 arr_desc_src = (m_vars_extra[i].read_rng_src) ?
4733 m_vars_extra[i].read_rng_src->arr_desc :
4734 make_arr_desc(NULL, // not required for source
4735 offset_src/el_size, size_src/el_size,
4736 el_size);
4737 arr_desc_dst = !m_vars[i].into ? arr_desc_src :
4738 (m_vars_extra[i].read_rng_dst) ?
4739 m_vars_extra[i].read_rng_dst->arr_desc :
4740 make_arr_desc(NULL,
4741 offset_dst/el_size, size_src/el_size, el_size);
4743 arr_desc_dst->base = reinterpret_cast<int64_t>(base);
4745 res = COI::BufferReadMultiD(
4746 m_vars_extra[i].src_data->mic_buf, // SourceBuffer
4747 m_vars[i].offset + m_vars[i].mic_offset -
4748 m_vars_extra[i].src_data->alloc_disp, // Offset
4749 (void*)arr_desc_dst, // descriptor of DestArray
4750 (void*)arr_desc_src, // descriptor of SrcArray
4751 COI_COPY_UNSPECIFIED, // Type
4752 m_num_in_dependencies, // Number of in Dependencies
4753 m_p_in_dependencies, // array of in Dependencies
4754 event); // out Dependency
4755 if (res != COI_SUCCESS) {
4756 if (m_status != 0) {
4757 m_status->result = translate_coi_error(res);
4758 return false;
4760 report_coi_error(c_buf_copy, res);
4762 return(true);
4764 if (m_vars[i].flags.is_non_cont_struct) {
4765 desc = m_vars_extra[i].noncont_desc;
4766 noncont_num = 0;
4768 else {
4769 // Set length_src and length_dst
4770 length_src = (m_vars_extra[i].read_rng_src) ?
4771 m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
4772 length_dst = !m_vars[i].into ? length_src :
4773 (m_vars_extra[i].read_rng_dst) ?
4774 m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
4775 receive_size = (length_src < length_dst) ? length_src : length_dst;
4778 // if event is defined we must replicate it for all contiguous intervals
4779 // that will be copied/read.
4780 // Take into account that we already have 1 event.
4781 if (event) {
4782 uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
4783 desc->interval_cnt :
4784 (length_src / receive_size) *
4785 ((m_vars_extra[i].read_rng_src) ?
4786 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
4787 m_out_deps_allocated += range_num;
4788 m_out_deps =
4789 (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated);
4790 m_out_deps_total--;
4793 // successively get contiguous ranges,
4794 // define the corresponding destination offset and receive the data
4795 do {
4796 if (m_vars[i].flags.is_non_cont_struct) {
4797 // ranges are over
4798 if (noncont_num >= desc->interval_cnt) {
4799 break;
4801 offset_src = offset_dst = desc->interval[noncont_num].lower;
4802 receive_size = desc->interval[noncont_num].size;
4803 noncont_num++;
4805 else { // get source offset
4806 if (src_is_empty) {
4807 if (m_vars_extra[i].read_rng_src) {
4808 if (!get_next_range(m_vars_extra[i].read_rng_src,
4809 &offset_src)) {
4810 // source ranges are over - nothing to send
4811 break;
4814 else if (received_data == 0) {
4815 offset_src = m_vars[i].disp;
4817 else {
4818 break;
4820 length_src_cur = length_src;
4822 else {
4823 // if the source is contiguous or its contiguous range is greater
4824 // than the destination one
4825 offset_src += receive_size;
4827 length_src_cur -= receive_size;
4828 src_is_empty = length_src_cur == 0;
4830 // get destination offset
4831 if (dst_is_empty) {
4832 if (m_vars[i].into) {
4833 if (m_vars_extra[i].read_rng_dst) {
4834 if (!get_next_range(m_vars_extra[i].read_rng_dst,
4835 &offset_dst)) {
4836 // destination ranges are over
4837 LIBOFFLOAD_ERROR(c_destination_is_over);
4838 return false;
4841 // destination is contiguous.
4842 else {
4843 offset_dst = m_vars_extra[i].cpu_disp;
4845 length_dst_cur = length_dst;
4847 // same as source
4848 else {
4849 offset_dst = offset_src;
4850 length_dst_cur = length_src;
4853 else {
4854 // if the destination is contiguous or its contiguous range is greater
4855 // than the source one
4856 offset_dst += receive_size;
4858 length_dst_cur -= receive_size;
4859 dst_is_empty = length_dst_cur == 0;
4861 if (event) {
4862 event = &m_out_deps[m_out_deps_total++];
4864 if (dst_buf != 0) {
4865 res = COI::BufferCopy(
4866 dst_buf,
4867 m_vars_extra[i].src_data->mic_buf,
4868 m_vars_extra[i].cpu_offset + offset_dst,
4869 m_vars[i].offset + offset_src +
4870 m_vars[i].mic_offset,
4871 receive_size,
4872 COI_COPY_UNSPECIFIED,
4873 m_num_in_dependencies,
4874 m_p_in_dependencies,
4875 event);
4876 if (res != COI_SUCCESS) {
4877 if (m_status != 0) {
4878 m_status->result = translate_coi_error(res);
4879 return false;
4881 report_coi_error(c_buf_copy, res);
4884 else {
4885 res = COI::BufferRead(
4886 m_vars_extra[i].src_data->mic_buf,
4887 m_vars[i].offset + offset_src +
4888 m_vars[i].mic_offset,
4889 base + offset_dst,
4890 receive_size,
4891 COI_COPY_UNSPECIFIED,
4892 m_num_in_dependencies,
4893 m_p_in_dependencies,
4894 event);
4895 if (res != COI_SUCCESS) {
4896 if (m_status != 0) {
4897 m_status->result = translate_coi_error(res);
4898 return false;
4900 report_coi_error(c_buf_read, res);
4903 received_data += receive_size;
4905 while (true);
4906 return true;
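// Receives pointer-based "out" data from the target, mirroring
// send_pointer_data: BufferCopy into a host-side COI buffer when one
// exists, BufferRead into raw host memory otherwise. It also schedules
// buffer destruction and removes associations for variables with free_if.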
4909 bool OffloadDescriptor::receive_pointer_data(bool is_async,
4910 bool first_run, void *info)
4912 OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
4914 bool should_use_async_buffer_read = m_initial_need_runfunction;
4915 uint64_t ptr_received = 0;
4916 COIRESULT res;
4918 // For offload_transfer and offload with an empty body without a signal:
4919 // - if there is only one buffer copy - get data synchronously
4920 // - if there are multiple buffer copies and
4921 // __offload_parallel_copy is false - get data synchronously
4922 // - if there are multiple buffer copies
4923 // and __offload_parallel_copy is true - get data asynchronously
4924 // This concerns only data with size greater than __offload_use_async_buffer_read.
4925 // Data of size less than __offload_use_async_buffer_read are received synchronously.
4926 // Synchronous transfer results in better performance in COI.
4927 // __offload_parallel_copy is false by default but can be changed
4928 // via the environment variable OFFLOAD_PARALLEL_COPY
4929 if (!m_initial_need_runfunction && __offload_parallel_copy) {
4930 int big_size_count = 0;
4932 for (int i = 0; i < m_vars_total; i++) {
4933 if (m_vars[i].direction.out &&
4934 m_vars[i].size >= __offload_use_async_buffer_read) {
4935 // preallocated OUT is handled only at the second run
4936 if (first_run == m_vars[i].flags.preallocated) {
4937 continue;
4939 switch (m_vars_extra[i].type_src) {
4940 case c_data:
4941 case c_void_ptr:
4942 case c_void_ptr_ptr:
4943 case c_cean_var:
4944 if (m_vars[i].flags.is_static) {
4945 big_size_count++;
4947 break;
4948 case c_string_ptr:
4949 case c_data_ptr:
4950 case c_string_ptr_ptr:
4951 case c_data_ptr_ptr:
4952 case c_cean_var_ptr:
4953 case c_cean_var_ptr_ptr:
4954 case c_dv_data:
4955 case c_dv_ptr_data:
4956 case c_dv_data_slice:
4957 case c_dv_ptr_data_slice:
4958 case c_dv_ptr:
4959 big_size_count++;
4960 break;
4961 default:
4962 break;
4966 if (big_size_count > 1) {
4967 should_use_async_buffer_read = true;
4970 uint32_t in_deps_amount = m_in_deps_total;
4971 COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
4973 for (int i = 0; i < m_vars_total; i++) {
4974 uint64_t received_data = m_vars[i].size;
4976 // Nothing to receive if use_device_ptr
4977 if (m_vars[i].flags.use_device_ptr )
4978 continue;
4979 if (m_vars_extra[i].omp_last_event_type == c_last_read &&
4980 m_out_deps_total > 0) {
4981 m_num_in_dependencies = m_out_deps_total;
4982 m_p_in_dependencies = m_out_deps;
4984 // At the first run don't receive via a preallocated target pointer, as the
4985 // pointer value will be ready only after the call to scatter_copyout_data
4986 if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
4987 m_preallocated_alloc = true;
4988 // need one more call to OffloadDescriptor::receive_pointer_data
4989 if (m_vars[i].direction.out) {
4990 m_out_with_preallocated = true;
4992 continue;
4994 switch (m_vars_extra[i].type_src) {
4995 case c_data_ptr_array:
4996 break;
4997 case c_data:
4998 case c_void_ptr:
4999 case c_void_ptr_ptr:
5000 case c_cean_var:
5001 if (m_vars[i].direction.out &&
5002 m_vars[i].flags.is_static) {
5003 COIEVENT *event =
5004 (m_stream != no_stream ||
5005 is_async ||
5006 m_in_deps_total > 0 ||
5007 (should_use_async_buffer_read &&
5008 m_vars[i].size >= __offload_use_async_buffer_read)) ?
5009 &m_out_deps[m_out_deps_total++] : 0;
5010 PtrData *ptr_data = NULL;
5011 COIBUFFER dst_buf = NULL; // buffer at host
5012 char *base;
5014 if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
5015 ptr_data = m_vars[i].into ?
5016 m_vars_extra[i].dst_data :
5017 m_vars_extra[i].src_data;
5019 else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
5020 if (m_vars[i].flags.is_static_dstn) {
5021 ptr_data = m_vars[i].into ?
5022 m_vars_extra[i].dst_data :
5023 m_vars_extra[i].src_data;
5026 dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
5027 if (dst_buf == NULL) {
5028 base = offload_get_src_base(
5029 m_vars[i].into ?
5030 static_cast<char*>(m_vars[i].into) :
5031 static_cast<char*>(m_vars[i].ptr),
5032 m_vars_extra[i].type_dst);
5035 if (m_vars[i].flags.is_non_cont_struct ||
5036 m_vars[i].flags.is_noncont_src ||
5037 m_vars[i].flags.is_noncont_dst) {
5038 receive_noncontiguous_pointer_data(
5039 i, dst_buf, event, received_data,
5040 m_num_in_dependencies, m_p_in_dependencies);
5042 else if (dst_buf != 0) {
5043 res = COI::BufferCopy(
5044 dst_buf,
5045 m_vars_extra[i].src_data->mic_buf,
5046 m_vars_extra[i].cpu_offset +
5047 m_vars_extra[i].cpu_disp,
5048 m_vars[i].offset + m_vars[i].disp,
5049 m_vars[i].size,
5050 COI_COPY_UNSPECIFIED,
5051 m_num_in_dependencies,
5052 m_p_in_dependencies,
5053 event);
5054 if (res != COI_SUCCESS) {
5055 if (m_status != 0) {
5056 m_status->result = translate_coi_error(res);
5057 return false;
5059 report_coi_error(c_buf_copy, res);
5062 else {
5063 res = COI::BufferRead(
5064 m_vars_extra[i].src_data->mic_buf,
5065 m_vars[i].offset + m_vars[i].disp,
5066 base + m_vars_extra[i].cpu_offset +
5067 m_vars_extra[i].cpu_disp,
5068 m_vars[i].size,
5069 COI_COPY_UNSPECIFIED,
5070 m_num_in_dependencies,
5071 m_p_in_dependencies,
5072 event);
5073 if (res != COI_SUCCESS) {
5074 if (m_status != 0) {
5075 m_status->result = translate_coi_error(res);
5076 return false;
5078 report_coi_error(c_buf_read, res);
5081 ptr_received += received_data;
5083 break;
5085 case c_string_ptr:
5086 case c_data_ptr:
5087 case c_string_ptr_ptr:
5088 case c_data_ptr_ptr:
5089 case c_cean_var_ptr:
5090 case c_cean_var_ptr_ptr:
5091 case c_dv_data:
5092 case c_dv_ptr_data:
5093 case c_dv_data_slice:
5094 case c_dv_ptr_data_slice:
5095 case c_dv_ptr: {
5096 COIBUFFER dst_buf = NULL; // buffer on host
5097 if (m_vars[i].direction.out && m_vars[i].size > 0) {
5098 COIEVENT *event =
5099 (m_stream != no_stream ||
5100 is_async ||
5101 m_in_deps_total > 0 ||
5102 (should_use_async_buffer_read &&
5103 m_vars[i].size >= __offload_use_async_buffer_read)) ?
5104 &m_out_deps[m_out_deps_total++] : 0;
5106 uint64_t dst_offset = 0;
5107 char *base = static_cast<char*>(m_vars[i].ptr);
5109 if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
5110 PtrData *ptr_data = m_vars[i].into ?
5111 m_vars_extra[i].dst_data :
5112 m_vars_extra[i].src_data;
5113 dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
5114 if (dst_buf == NULL) {
5115 base = m_vars[i].into ?
5116 *static_cast<char**>(m_vars[i].into) :
5117 *static_cast<char**>(m_vars[i].ptr);
5119 dst_offset = m_vars_extra[i].cpu_offset +
5120 m_vars_extra[i].cpu_disp;
5122 else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
5123 if (m_vars[i].flags.is_static_dstn) {
5124 dst_buf = m_vars[i].into ?
5125 m_vars_extra[i].dst_data->cpu_buf :
5126 m_vars_extra[i].src_data->cpu_buf;
5128 if (dst_buf == NULL) {
5129 base = offload_get_src_base(
5130 m_vars[i].into ?
5131 static_cast<char*>(m_vars[i].into) :
5132 static_cast<char*>(m_vars[i].ptr),
5133 m_vars_extra[i].type_dst);
5135 dst_offset = m_vars_extra[i].cpu_offset +
5136 m_vars_extra[i].cpu_disp;
5138 else if (VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst) ||
5139 VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
5140 PtrData *ptr_data = m_vars[i].into != 0 ?
5141 m_vars_extra[i].dst_data :
5142 m_vars_extra[i].src_data;
5143 dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
5144 if (dst_buf == NULL) {
5145 base = offload_get_src_base(
5146 m_vars[i].into ?
5147 static_cast<char*>(m_vars[i].into) :
5148 static_cast<char*>(m_vars[i].ptr),
5149 m_vars_extra[i].type_dst);
5152 dst_offset = m_vars_extra[i].cpu_offset +
5153 m_vars_extra[i].cpu_disp;
5156 if (m_vars[i].flags.is_non_cont_struct ||
5157 m_vars[i].flags.is_noncont_src ||
5158 m_vars[i].flags.is_noncont_dst) {
5159 receive_noncontiguous_pointer_data(
5160 i, dst_buf, event, received_data,
5161 m_num_in_dependencies, m_p_in_dependencies);
5163 else if (dst_buf != 0) {
5164 res = COI::BufferCopy(
5165 dst_buf,
5166 m_vars_extra[i].src_data->mic_buf,
5167 dst_offset,
5168 m_vars[i].offset + m_vars[i].disp +
5169 m_vars[i].mic_offset,
5170 m_vars[i].size,
5171 COI_COPY_UNSPECIFIED,
5172 m_num_in_dependencies,
5173 m_p_in_dependencies,
5174 event);
5175 if (res != COI_SUCCESS) {
5176 if (m_status != 0) {
5177 m_status->result = translate_coi_error(res);
5178 return false;
5180 report_coi_error(c_buf_copy, res);
5183 else {
5184 res = COI::BufferRead(
5185 m_vars_extra[i].src_data->mic_buf,
5186 m_vars[i].offset + m_vars[i].disp +
5187 m_vars[i].mic_offset,
5188 base + dst_offset,
5189 m_vars[i].size,
5190 COI_COPY_UNSPECIFIED,
5191 m_num_in_dependencies,
5192 m_p_in_dependencies,
5193 event);
5194 if (res != COI_SUCCESS) {
5195 if (m_status != 0) {
5196 m_status->result = translate_coi_error(res);
5197 return false;
5199 report_coi_error(c_buf_read, res);
5202 ptr_received += received_data;
5204 break;
5207 default:
5208 break;
5211 if (m_vars_extra[i].omp_last_event_type == c_last_read) {
5212 register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info);
5214 // destroy buffers for obsolete stacks
5215 if (m_destroy_stack.size() != 0) {
5216 for (PtrDataList::iterator it = m_destroy_stack.begin();
5217 it != m_destroy_stack.end(); it++) {
5218 PtrData *ptr_data = *it;
5219 m_destroy_buffers.push_back(ptr_data->mic_buf);
5220 OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
5221 ptr_data->mic_addr);
5223 m_destroy_stack.clear();
5225 if (m_vars[i].free_if) {
5226 // remove association for automatic variables
5227 if (m_is_openmp) {
5228 if (m_vars_extra[i].auto_data) {
5229 AutoData *auto_data = m_vars_extra[i].auto_data;
5230 if (m_vars[i].flags.always_delete) {
5231 auto_data->nullify_reference();
5233 else if (auto_data->remove_reference() == 0) {
5234 m_device.remove_auto_data(auto_data->cpu_addr.start());
5236 continue;
5238 else {
5239 PtrData *ptr_data = m_vars_extra[i].src_data;
5240 if (ptr_data &&
5241 IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
5242 if (ptr_data->get_reference() > 0) {
5243 ptr_data->remove_reference();
5245 continue;
5250 // destroy buffers
5251 if (m_vars[i].direction.out || m_vars[i].into == NULL) {
5252 if (!VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) &&
5253 !VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) &&
5254 !VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src)) {
5255 continue;
5258 PtrData *ptr_data = m_vars_extra[i].src_data;
5259 if (ptr_data->remove_reference() == 0) {
5260 // destroy buffers
5261 if (ptr_data->cpu_buf != 0) {
5262 m_destroy_buffers.push_back(ptr_data->cpu_buf);
5264 if (ptr_data->mic_buf != 0) {
5265 m_destroy_buffers.push_back(ptr_data->mic_buf);
5267 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
5268 ptr_data->cpu_addr.start());
5270 // remove association from map
5271 if (m_vars[i].flags.targetptr) {
5272 m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
5274 else {
5275 m_device.remove_ptr_data(ptr_data->cpu_addr.start());
5279 else if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst) ||
5280 VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst) ||
5281 VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst)) {
5282 PtrData *ptr_data = m_vars_extra[i].dst_data;
5284 if (ptr_data->remove_reference() == 0) {
5285 // destroy buffers
5286 if (ptr_data->cpu_buf != 0) {
5287 m_destroy_buffers.push_back(ptr_data->cpu_buf);
5289 if (ptr_data->mic_buf != 0) {
5290 m_destroy_buffers.push_back(ptr_data->mic_buf);
5292 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
5293 ptr_data->cpu_addr.start());
5295 // remove association from map
5296 if (m_vars[i].flags.targetptr) {
5297 m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
5299 else {
5300 m_device.remove_ptr_data(ptr_data->cpu_addr.start());
5307 if (m_status) {
5308 m_status->data_received += ptr_received;
5311 m_num_in_dependencies = m_out_deps_total ? m_out_deps_total :
5312 m_num_in_dependencies;
5313 m_p_in_dependencies = m_out_deps_total ? m_out_deps : m_p_in_dependencies;
5315 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
5316 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
5317 c_offload_received_pointer_data,
5318 "Total pointer data received from target: [%lld] bytes\n",
5319 ptr_received);
5321 return true;
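// Scatters copy-out data produced by the run function: maps the inout
// buffer read-only, extracts timer data, creates associations for
// preallocated target pointers and unmarshals scalar and function-pointer
// results.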
5324 bool OffloadDescriptor::scatter_copyout_data()
5326 OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
5328 if (m_need_runfunction && m_out_datalen > 0) {
5330 // total size that needs to be transferred from target to host
5331 COIMAPINSTANCE map_inst;
5332 COIRESULT res;
5333 char *data;
5335 // output data buffer
5336 if (m_func_desc->data_offset == 0) {
5337 OffloadTimer timer_map(get_timer_data(),
5338 c_offload_host_map_out_data_buffer);
5340 COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
5341 COI_MAP_READ_ONLY, 0, 0, 0,
5342 &map_inst,
5343 reinterpret_cast<void**>(&data));
5344 if (res != COI_SUCCESS) {
5345 if (m_status != 0) {
5346 m_status->result = translate_coi_error(res);
5347 return false;
5349 report_coi_error(c_buf_map, res);
5352 else {
5353 data = (char*) m_func_desc + m_func_desc->data_offset;
5356 // get timing data
5357 OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
5358 data += OFFLOAD_TIMER_DATALEN();
5360 // initialize output marshaller
5361 m_out.init_buffer(data, m_out_datalen);
5363 for (int i = 0; i < m_vars_total; i++) {
5364 bool src_is_for_mic = (m_vars[i].direction.out ||
5365 m_vars[i].into == NULL);
5367 if (m_vars_extra[i].type_src != c_data_ptr_array &&
5368 m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
5369 PtrData *ptr_data;
5370 void *ptr_value;
5371 void ** cpu_ptr = src_is_for_mic ?
5372 reinterpret_cast<void**>(m_vars[i].ptr) :
5373 reinterpret_cast<void**>(m_vars[i].into);
5374 void* alloc_base = NULL;
5375 int64_t alloc_disp = 0;
5376 int64_t alloc_size;
5377 if (m_vars_extra[i].alloc != NULL) {
5378 // array descriptor
5379 const Arr_Desc *ap =
5380 static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);
5382 __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
5384 alloc_base = reinterpret_cast<void*>(ap->base);
5387 // get pointer to target memory
5388 m_out.receive_data(&ptr_value, sizeof(void*));
5390 // add new entry
5391 if (!alloc_ptr_data(
5392 ptr_data,
5393 ptr_value,
5394 (alloc_base != NULL) ?
5395 alloc_disp : m_vars[i].disp,
5396 (alloc_base != NULL) ?
5397 alloc_size : m_vars[i].size,
5398 alloc_disp,
5400 m_vars[i].flags.targetptr,
5401 m_vars[i].flags.preallocated,
5402 m_vars[i].flags.pin)) {
5403 return false;
5406 ptr_data->add_reference();
5407 *cpu_ptr = ptr_value;
5408 if (src_is_for_mic) {
5409 m_vars_extra[i].src_data = ptr_data;
5411 else {
5412 m_vars_extra[i].dst_data = ptr_data;
5414 m_vars[i].offset = (char*) ptr_value -
5415 (char*) ptr_data->cpu_addr.start();
5418 switch (m_vars_extra[i].type_src) {
5419 case c_data_ptr_array:
5420 break;
5421 case c_data:
5422 case c_void_ptr:
5423 case c_void_ptr_ptr:
5424 case c_cean_var:
5425 if (m_vars[i].direction.out &&
5426 !m_vars[i].flags.is_static) {
5428 if (m_vars[i].into) {
5429 char *ptr = offload_get_src_base(
5430 static_cast<char*>(m_vars[i].into),
5431 m_vars_extra[i].type_dst);
5432 m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
5433 m_vars[i].size);
5435 else {
5436 m_out.receive_data(
5437 static_cast<char*>(m_vars[i].ptr) +
5438 m_vars_extra[i].cpu_disp,
5439 m_vars[i].size);
5442 break;
5444 case c_func_ptr:
5445 case c_func_ptr_ptr:
5446 if (m_vars[i].direction.out) {
5447 m_out.receive_func_ptr((const void**) m_vars[i].ptr);
5449 break;
5451 default:
5452 break;
5456 if (m_status) {
5457 m_status->data_received += m_out.get_tfr_size();
5460 if (m_func_desc->data_offset == 0) {
5461 OffloadTimer timer_unmap(get_timer_data(),
5462 c_offload_host_unmap_out_data_buffer);
5464 COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
5465 if (res != COI_SUCCESS) {
5466 if (m_status != 0) {
5467 m_status->result = translate_coi_error(res);
5468 return false;
5470 report_coi_error(c_buf_unmap, res);
5475 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
5476 OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
5477 m_out.get_tfr_size());
5479 return true;
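// Computes the offset, size and element count described by an array
// descriptor. For a contiguous descriptor ptr_ranges is set to NULL;
// otherwise read ranges are initialized, size is the size of one contiguous
// range and el_number is the total element count.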
5482 static void get_arr_desc_numbers(
5483 const Arr_Desc *ap,
5484 int64_t el_size,
5485 int64_t &offset,
5486 int64_t &size,
5487 int &el_number,
5488 CeanReadRanges* &ptr_ranges
5491 if (is_arr_desc_contiguous(ap)) {
5492 ptr_ranges = NULL;
5493 __arr_data_offset_and_length(ap, offset, size);
5494 el_number = size / el_size;
5496 else {
5497 ptr_ranges = init_read_ranges_arr_desc(ap);
5498 el_number = (ptr_ranges->range_size / el_size) *
5499 ptr_ranges->range_max_number;
5500 size = ptr_ranges->range_size;
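// Expands a pointer-array variable into one VarDesc per transferred
// pointer, appended at the end of m_vars. Each clause component (extent,
// alloc_if, free_if, align, into, alloc) may be given either as a scalar
// applied to every element or as a per-element array.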
5504 bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
5506 int pointers_number;
5507 int tmp_val;
5508 int new_index = m_vars_total;
5509 const Arr_Desc *ap;
5510 const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
5511 int flags = vd3->array_fields;
5512 bool src_is_for_mic = (m_vars[i].direction.out ||
5513 m_vars[i].into == NULL);
5515 ReadArrElements<void *> ptr;
5516 ReadArrElements<void *> into;
5517 ReadArrElements<int64_t> ext_start;
5518 ReadArrElements<int64_t> ext_elements;
5519 ReadArrElements<int64_t> align;
5520 ReadArrElements<int64_t> alloc_if;
5521 ReadArrElements<int64_t> free_if;
5522 ReadArrElements<int64_t> into_start;
5523 ReadArrElements<int64_t> into_elem;
5524 ReadArrElements<int64_t> alloc_start;
5525 ReadArrElements<int64_t> alloc_elem;
5528 ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
5530 // "pointers_number" for total number of transferred pointers.
5531 // For each of them we create new var_desc and put it at the bottom
5532 // of the var_desc's array
5533 get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
5534 pointers_number, ptr.ranges);
5535 ptr.base = reinterpret_cast<char*>(ap->base);
5537 // 2. prepare memory for new var_descs
5538 m_vars_total += pointers_number;
5539 m_vars = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
5540 if (m_vars == NULL)
5541 LIBOFFLOAD_ERROR(c_malloc);
5542 m_vars_extra =
5543 (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
5544 if (m_vars_extra == NULL)
5545 LIBOFFLOAD_ERROR(c_malloc);
5546 m_in_deps =
5547 (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
5548 if (m_in_deps == NULL)
5549 LIBOFFLOAD_ERROR(c_malloc);
5550 m_out_deps =
5551 (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
5552 if (m_out_deps == NULL)
5553 LIBOFFLOAD_ERROR(c_malloc);
5555 // 3. Prepare for reading new var_desc's fields
5556 // EXTENT START
5557 if ((flags & (1<<flag_extent_start_is_array)) != 0) {
5558 ap = static_cast<const Arr_Desc*>(vd3->extent_start);
5559 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
5560 ext_start.size, tmp_val, ext_start.ranges);
5561 ext_start.base = reinterpret_cast<char*>(ap->base);
5562 ext_start.el_size = ap->dim[ap->rank - 1].size;
5564 if (tmp_val < pointers_number) {
5565 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
5566 return false;
5569 else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
5570 ext_start.val = (int64_t)vd3->extent_start;
5572 else {
5573 ext_start.val = 0;
5576 // EXTENT ELEMENTS NUMBER
5577 if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
5578 ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
5579 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
5580 ext_elements.offset, ext_elements.size,
5581 tmp_val, ext_elements.ranges);
5582 ext_elements.base = reinterpret_cast<char*>(ap->base);
5583 ext_elements.el_size = ap->dim[ap->rank - 1].size;
5585 if (tmp_val < pointers_number) {
5586 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
5587 return false;
5590 else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
5591 ext_elements.val = (int64_t)vd3->extent_elements;
5593 else {
5594 ext_elements.val = m_vars[i].count;
5597 // ALLOC_IF
5598 if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
5599 ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
5600 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
5601 alloc_if.size, tmp_val, alloc_if.ranges);
5602 alloc_if.base = reinterpret_cast<char*>(ap->base);
5603 alloc_if.el_size = ap->dim[ap->rank - 1].size;
5605 if (tmp_val < pointers_number) {
5606 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
5607 return false;
5610 else {
5611 alloc_if.val = m_vars[i].alloc_if;
5614 // FREE_IF
5615 if ((flags & (1<<flag_free_if_is_array)) != 0) {
5616 ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
5617 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
5618 free_if.size, tmp_val, free_if.ranges);
5619 free_if.base = reinterpret_cast<char*>(ap->base);
5620 free_if.el_size = ap->dim[ap->rank - 1].size;
5622 if (tmp_val < pointers_number) {
5623 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
5624 return false;
5627 else {
5628 free_if.val = m_vars[i].free_if;
5631 // ALIGN
5633 if ((flags & (1<<flag_align_is_array)) != 0) {
5634 ap = static_cast<const Arr_Desc*>(vd3->align_array);
5635 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
5636 align.size, tmp_val, align.ranges);
5637 align.base = reinterpret_cast<char*>(ap->base);
5638 align.el_size = ap->dim[ap->rank - 1].size;
5640 if (tmp_val < pointers_number) {
5641 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
5642 return false;
5645 else {
5646 align.val = m_vars[i].align;
5649 // 3.1 INTO
5651 if (m_vars[i].into) {
5652 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
5653 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
5654 into.size, tmp_val, into.ranges);
5655 into.base = reinterpret_cast<char*>(ap->base);
5657 if (tmp_val < pointers_number) {
5658 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
5659 return false;
5663 // 3.2 INTO_START
5665 if ((flags & (1<<flag_into_start_is_array)) != 0) {
5666 ap = static_cast<const Arr_Desc*>(vd3->into_start);
5667 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
5668 into_start.size, tmp_val, into_start.ranges);
5669 into_start.base = reinterpret_cast<char*>(ap->base);
5670 into_start.el_size = ap->dim[ap->rank - 1].size;
5672 if (tmp_val < pointers_number) {
5673 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
5674 return false;
5677 else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
5678 into_start.val = (int64_t)vd3->into_start;
5680 else {
5681 into_start.val = 0;
5684 // 3.3 INTO_ELEMENTS
5686 if ((flags & (1<<flag_into_elements_is_array)) != 0) {
5687 ap = static_cast<const Arr_Desc*>(vd3->into_elements);
5688 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
5689 into_elem.size, tmp_val, into_elem.ranges);
5690 into_elem.base = reinterpret_cast<char*>(ap->base);
5691 into_elem.el_size = ap->dim[ap->rank - 1].size;
5693 if (tmp_val < pointers_number) {
5694 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
5695 return false;
5698 else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
5699 into_elem.val = (int64_t)vd3->into_elements;
5701 else {
5702 into_elem.val = m_vars[i].count;
5705 // alloc_start
5707 if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
5708 ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
5709 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
5710 alloc_start.offset, alloc_start.size, tmp_val,
5711 alloc_start.ranges);
5712 alloc_start.base = reinterpret_cast<char*>(ap->base);
5713 alloc_start.el_size = ap->dim[ap->rank - 1].size;
5715 if (tmp_val < pointers_number) {
5716 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
5717 return false;
5720 else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
5721 alloc_start.val = (int64_t)vd3->alloc_start;
5723 else {
5724 alloc_start.val = 0;
5727 // alloc_elem
5729 if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
5730 ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
5731 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
5732 alloc_elem.size, tmp_val, alloc_elem.ranges);
5733 alloc_elem.base = reinterpret_cast<char*>(ap->base);
5734 alloc_elem.el_size = ap->dim[ap->rank - 1].size;
5735 if (tmp_val < pointers_number) {
5736 LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
5737 "alloc_extent elements");
5738 return false;
5741 else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
5742 alloc_elem.val = (int64_t)vd3->alloc_elements;
5744 else {
5745 alloc_elem.val = 0;
5748 for (int k = 0; k < pointers_number; k++) {
5749 int type = flags & 0x3f;
5750 int type_src, type_dst;
5751 // Get new values
5752 // type_src, type_dst
5753 type_src = type_dst = (type == c_data_ptr_array) ?
5754 c_data_ptr : (type == c_func_ptr_array) ?
5755 c_func_ptr : (type == c_void_ptr_array) ?
5756 c_void_ptr : (type == c_string_ptr_array) ?
5757 c_string_ptr : 0;
5759 // Get ptr val
5760 if (!ptr.read_next(true)) {
5761 break;
5763 else {
5764 ptr.val = (void*)(ptr.base + ptr.offset);
5767 // !!! If we get an error at the reading phase - it's an internal
5768 // !!! error, as we must have detected the mismatch before
5770 // Get into val
5771 if (m_vars[i].into) {
5772 if (!into.read_next(true)) {
5773 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
5774 LIBOFFLOAD_ABORT;
5776 else {
5777 into.val = (void*)(into.base + into.offset);
5781 // Get other components of the clause
5782 if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
5783 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
5784 LIBOFFLOAD_ABORT;
5786 if (!ext_elements.read_next(
5787 flags & (1<<flag_extent_elements_is_array))) {
5788 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
5789 LIBOFFLOAD_ABORT;
5791 if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
5792 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
5793 LIBOFFLOAD_ABORT;
5795 if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
5796 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
5797 LIBOFFLOAD_ABORT;
5799 if (!align.read_next(flags & (1<<flag_align_is_array))) {
5800 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
5801 LIBOFFLOAD_ABORT;
5803 if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
5804 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
5805 LIBOFFLOAD_ABORT;
5807 if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
5808 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
5809 LIBOFFLOAD_ABORT;
5811 if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
5812 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
5813 LIBOFFLOAD_ABORT;
5815 if (!alloc_elem.read_next(
5816 flags & (1<<flag_alloc_elements_is_array))) {
5817 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
5818 LIBOFFLOAD_ABORT;
5821 m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
5822 m_vars[new_index + k].alloc_if = alloc_if.val;
5823 m_vars[new_index + k].free_if = free_if.val;
5824 m_vars[new_index + k].align = align.val;
5825 m_vars[new_index + k].mic_offset = 0;
5826 m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
5827 m_vars[new_index + k].flags.is_pointer = 0;
5828 m_vars[new_index + k].offset = 0;
5829 m_vars[new_index + k].size = m_vars[i].size;
5830 m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
5831 m_vars[new_index + k].flags.preallocated =
5832 m_vars[i].flags.preallocated;
5834 if (ext_start.val == 0) {
5835 m_vars[new_index + k].count = ext_elements.val;
5836 m_vars[new_index + k].ptr = ptr.val;
5837 if (type_src == c_string_ptr) {
5838 m_vars[new_index + k].size = 0;
5841 else {
5842 m_vars[new_index + k].count = 0;
5843 m_vars[new_index + k].ptr =
5844 static_cast<void*>(make_arr_desc(
5845 ptr.val,
5846 ext_start.val,
5847 ext_elements.val,
5848 m_vars[i].size));
5850 type_src = type_src == c_data_ptr ? c_cean_var_ptr :
5851 c_string_ptr ? c_cean_var_ptr :
5852 type_src;
5853 if (!m_vars[i].into) {
5854 type_dst = type_src;
5858 if (m_vars[i].into && into_elem.val != 0) {
5859 m_vars[new_index + k].into =
5860 static_cast<void*>(make_arr_desc(
5861 into.val,
5862 into_start.val,
5863 into_elem.val,
5864 m_vars[i].size));
5865 type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
5866 (type == c_string_ptr_array) ? c_cean_var_ptr :
5867 type_src;
5869 else {
5870 m_vars[new_index + k].into = NULL;
5873 if (alloc_elem.val != 0) {
5874 m_vars[new_index + k].alloc =
5875 static_cast<void*>(make_arr_desc(
5876 ptr.val,
5877 alloc_start.val,
5878 alloc_elem.val,
5879 m_vars[i].size));
5881 else {
5882 m_vars[new_index + k].alloc = NULL;
5885 m_vars[new_index + k].type.src =
5886 m_vars_extra[new_index + k].type_src = type_src;
5887 m_vars[new_index + k].type.dst =
5888 m_vars_extra[new_index + k].type_dst = type_dst;
5890 m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
5891 m_vars_extra[new_index + k].is_arr_ptr_el = 1;
5892 m_vars_extra[new_index + k].ptr_arr_offset =
5893 src_is_for_mic ? ptr.offset : into.offset;
5895 // The count and alloc fields are not used at the target. They can be
5896 // reused for pointer arrays.
5897 m_vars[i].count = pointers_number;
5898 m_vars[i].ptr_arr_offset = new_index;
5899 return true;
5902 // Gets the in-dependencies of the previous offload via the stream "m_stream".
5903 // Out argument in_deps_amount - address of the amount of dependencies.
5904 // Out argument in_deps - address of the array of dependencies.
5905 // Description of the dependency scheme for streams:
5906 // ----------------------------------------------------
5907 // Every offload forms a DAG consisting of 3 nodes:
5908 // in-transfers, runfunction and out-transfers.
5909 // Every node has in-dependencies and out-dependencies.
5910 // The out-dependencies of the previous node form the in-dependencies of the current node.
5911 // Without streams, the in-dependencies of the 1st node (in-transfers) are
5912 // NULL. For streams, the in-dependencies of the 1st node are the list of
5913 // out-dependencies of the last node of the previous offload via this stream.
5914 // So the DAGs of 2 consecutive offloads via the same stream are
5915 // connected in the way described above.
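// Schematically, for two consecutive offloads on the same stream:
//   offload N:   [in-transfers] -> [runfunction] -> [out-transfers]
//                                                         |
//   offload N+1: [in-transfers] -> [runfunction] -> ...   |
//                      ^----------------------------------+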
5916 void OffloadDescriptor::get_stream_in_dependencies(
5917 uint32_t &in_deps_amount,
5918 COIEVENT* &in_deps
5921 if (m_stream != no_stream && m_stream != 0) {
5922 Stream * stream = Stream::find_stream(m_stream, false);
5923 if (!stream) {
5924 LIBOFFLOAD_ERROR(c_offload_no_stream,
5925 m_device.get_logical_index());
5926 LIBOFFLOAD_ABORT;
5928 OffloadDescriptor* offload = stream->get_last_offload();
5930 // if it's the first offload in the stream
5931 if (!offload) {
5932 return;
5934 // if the last offload has out-transfers
5935 if (offload->m_out_deps_total) {
5936 in_deps_amount = offload->m_out_deps_total;
5937 in_deps = offload->m_out_deps;
5939 // the last offload only sends pointer data, runs the function, or both,
5940 // and has no out-transfers
5941 else if (offload->m_in_deps_total) {
5942 in_deps_amount = offload->m_in_deps_total;
5943 in_deps = offload->m_in_deps;
5948 static void __offload_fini_library(void)
5950 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
5951 if (mic_engines_total > 0) {
5952 delete[] mic_engines;
5953 mic_engines_total = 0;
5955 if (mic_proxy_fs_root != 0) {
5956 free(mic_proxy_fs_root);
5957 mic_proxy_fs_root = 0;
5960 if (knc_library_path != 0) {
5961 free(knc_library_path);
5962 knc_library_path = 0;
5965 if (knl_library_path != 0) {
5966 free(knl_library_path);
5967 knl_library_path = 0;
5970 // destroy thread key
5971 thread_key_delete(mic_thread_key);
5974 // unload COI library
5975 if (COI::is_available) {
5976 COI::fini();
5979 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
5982 typedef std::pair<int, micLcpuMask*> deviceLcpu;
5983 typedef std::list<deviceLcpu> deviceLcpuList;
5985 static int process_offload_devices(
5986 const char *env_var,
5987 uint32_t num_devices,
5988 deviceLcpuList &device_cpu_list
5991 // The value is composed of comma-separated physical device indexes,
5992 // each optionally qualified by a logical CPU subset, e.g. 0[60,70-80]
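// For example (illustrative), the value "1[0-3,7],2" selects physical
// device 1 restricted to logical CPUs 0-3 and 7, plus all of device 2.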
5993 char *buf = strdup(env_var);
5994 if (buf == NULL)
5995 LIBOFFLOAD_ERROR(c_malloc);
5996 char *str = buf;
5997 bool device_set_finished = false;
5998 int num_devices_specified = 0;
5999 do {
6000 char *dev_ptr = str;
6001 int dev_len = strcspn(str, "[,");
6002 micLcpuMask* cpu_mask = 0;
6003 if (str[dev_len] == '[') {
6004 // CPU subset specified
6005 cpu_mask = new micLcpuMask;
6006 cpu_mask->reset();
6007 char *cpu_ptr = str + dev_len + 1;
6008 do {
6009 int64_t cnum;
6010 bool cpu_set_finished = false;
6011 int cpu_len = strcspn(cpu_ptr, ",-]");
6012 if (cpu_ptr[cpu_len] == ',' || cpu_ptr[cpu_len] == ']') {
6013 // A single CPU specified
6014 cpu_set_finished = cpu_ptr[cpu_len] == ']';
6015 cpu_ptr[cpu_len] = '\0';
6016 // Convert cpu string to an int
6017 if (!__offload_parse_int_string(cpu_ptr, cnum)) {
6018 LIBOFFLOAD_ERROR(c_mic_init7);
6019 delete cpu_mask;
6020 free(buf);
6021 return 0;
6022 } else {
6023 OFFLOAD_DEBUG_TRACE(3,
6024 "Single CPU %d selected\n", cnum);
6025 cpu_mask->set(cnum);
6027 cpu_ptr = cpu_ptr + cpu_len + 1;
6028 if (cpu_set_finished) {
6029 break;
6031 } else if (cpu_ptr[cpu_len] == '-') {
6032 int64_t range_start, range_end;
6033 // A range of CPUs specified
6034 cpu_ptr[cpu_len] = '\0';
6035 // Convert cpu string to an int
6036 if (!__offload_parse_int_string(cpu_ptr, range_start)) {
6037 LIBOFFLOAD_ERROR(c_mic_init8);
6038 delete cpu_mask;
6039 free(buf);
6040 return 0;
6041 } else {
6042 OFFLOAD_DEBUG_TRACE(3,
6043 "Start of CPU range specified as %d\n",
6044 range_start);
6045 cpu_ptr = cpu_ptr + cpu_len + 1;
6046 cpu_len = strcspn(cpu_ptr, ",]");
6047 if (cpu_ptr[cpu_len] == ',' ||
6048 cpu_ptr[cpu_len] == ']') {
6049 cpu_set_finished = cpu_ptr[cpu_len] == ']';
6050 cpu_ptr[cpu_len] = '\0';
6051 // Convert cpu string to an int
6052 if (!__offload_parse_int_string(
6053 cpu_ptr, range_end)) {
6054 LIBOFFLOAD_ERROR(c_mic_init9);
6055 delete cpu_mask;
6056 free(buf);
6057 return 0;
6058 } else {
6059 OFFLOAD_DEBUG_TRACE(3,
6060 "End of CPU range specified as %d\n",
6061 range_end);
6062 if (range_end < range_start) {
6063 LIBOFFLOAD_ERROR(c_mic_init10);
6064 delete cpu_mask;
6065 free(buf);
6066 return 0;
6067 } else {
for (int i = range_start; i <= range_end; i++) {
    OFFLOAD_DEBUG_TRACE(3,
        "CPU %d selected as part of range\n", i);
    cpu_mask->set(i);
}
cpu_ptr = cpu_ptr + cpu_len + 1;
6076 if (cpu_set_finished) {
6077 break;
6081 } else {
6082 LIBOFFLOAD_ERROR(c_mic_init10);
6083 delete cpu_mask;
6084 free(buf);
6085 return 0;
6088 } else {
6089 // Error: expected , or - or ]
6090 LIBOFFLOAD_ERROR(c_mic_init11);
6091 delete cpu_mask;
6092 free(buf);
6093 return 0;
6095 } while (true);
6096 // Point to next device specification
6097 str = cpu_ptr;
6098 if (*str == '\0') {
6099 device_set_finished = true;
6100 } else {
6101 // Skip the comma after a device specification
6102 str++;
6104 } else if (str[dev_len] == ',') {
6105 // CPU subset not specified
6106 // Point to next device specification
6107 str = str + dev_len + 1;
6108 } else {
6109 // No more device specifications
6110 device_set_finished = true;
6112 dev_ptr[dev_len] = '\0';
6113 // Convert device string to an int
6114 int64_t num;
6115 if (!__offload_parse_int_string(dev_ptr, num)) {
6116 LIBOFFLOAD_ERROR(c_mic_init5);
6117 delete cpu_mask;
6118 free(buf);
6119 return 0;
6121 if (num < 0 || num >= num_devices) {
6122 LIBOFFLOAD_ERROR(c_mic_init6, num);
6123 delete cpu_mask;
6124 free(buf);
6125 return 0;
6127 OFFLOAD_DEBUG_TRACE(3, "Offloadable MIC = %d\n", num);
6128 // Save the specified physical device and cpu mask
6129 device_cpu_list.push_back(make_pair(num, cpu_mask));
6130 num_devices_specified++;
6132 if (device_set_finished) {
6133 break;
6135 } while (true);
6137 free(buf);
6138 return num_devices_specified;
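// Worked example (illustrative, not from the source): with num_devices == 4,
// OFFLOAD_DEVICES="1,3[0-2,60]" produces a two-element device_cpu_list:
//     (1, NULL)            - physical device 1, no CPU subset (all CPUs)
//     (3, mask{0,1,2,60})  - physical device 3, CPUs 0-2 and 60 only
// and the function returns 2.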
6141 static void __offload_init_library_once(void)
6143 COIRESULT res;
6144 uint32_t num_devices;
6145 deviceLcpuList device_cpu_list;
6146 prefix = report_get_message_str(c_report_host);
6148 // initialize trace
6149 const char *env_var = getenv(htrace_envname);
6150 if (env_var != 0 && *env_var != '\0') {
6151 int64_t new_val;
6152 if (__offload_parse_int_string(env_var, new_val)) {
6153 console_enabled = new_val & 0x0f;
6157 OFFLOAD_DEBUG_TRACE(2, "---- Start of environment variable processing\n");
6158 env_var = getenv(offload_report_envname);
6159 if (env_var != 0 && *env_var != '\0') {
6160 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6161 offload_report_envname, env_var);
6162 int64_t env_val;
6163 if (__offload_parse_int_string(env_var, env_val)) {
6164 if (env_val == OFFLOAD_REPORT_1 ||
6165 env_val == OFFLOAD_REPORT_2 ||
6166 env_val == OFFLOAD_REPORT_3) {
6167 offload_report_level = env_val;
6168 OFFLOAD_DEBUG_TRACE(2, "Offload report level set to %d\n",
6169 offload_report_level);
6171 else {
6172 LIBOFFLOAD_ERROR(c_invalid_env_report_value,
6173 offload_report_envname);
6176 else {
6177 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6178 offload_report_envname);
6181 else if (!offload_report_level) {
6182 env_var = getenv(timer_envname);
6183 if (env_var != 0 && *env_var != '\0') {
6184 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n", timer_envname, env_var);
6185 timer_enabled = atoi(env_var);
6186 OFFLOAD_DEBUG_TRACE(2, "Timer enable flag set to %d\n",
6187 timer_enabled);
6191 // initialize COI
6192 if (!COI::init()) {
6193 return;
6196 // Process OFFLOAD_NODES, specification of physical MICs available
6197 env_var = getenv("OFFLOAD_NODES");
6198 if (env_var != 0 && *env_var != '\0') {
6199 OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_NODES=%s\n", env_var);
6200 // Pass env var on to COI
6201 char * new_env_var =
6202 (char*) malloc(sizeof("COI_OFFLOAD_NODES=") +
6203 strlen(env_var) + 1);
6204 if (new_env_var == NULL)
6205 LIBOFFLOAD_ERROR(c_malloc);
6206 sprintf(new_env_var, "COI_OFFLOAD_NODES=%s", env_var);
6207 putenv(new_env_var);
6208 OFFLOAD_DEBUG_TRACE(2, "Setting COI_OFFLOAD_NODES = %s \n", getenv("COI_OFFLOAD_NODES"));
// The value is composed of comma-separated physical device indexes.
6211 char *buf = strdup(env_var);
6212 if (buf == NULL)
6213 LIBOFFLOAD_ERROR(c_malloc);
6214 char *str, *ptr;
6215 int num_mics = 0;
6216 for (str = strtok_r(buf, ",", &ptr); str != 0;
6217 str = strtok_r(0, ",", &ptr)) {
6218 // count this MIC
6219 num_mics++;
6221 OFFLOAD_DEBUG_TRACE(2, "Number of offloadable MICs = %d\n", num_mics);
6222 free(buf);
6224 else {
6225 OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_NODES is not set\n");
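    // Example (illustrative): OFFLOAD_NODES=0,2 is re-exported to COI as
    // COI_OFFLOAD_NODES=0,2 and counted above as num_mics == 2.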
6228 // get number of devices installed in the system
6229 res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
6230 if (res != COI_SUCCESS) {
6231 return;
6234 if (num_devices > MIC_ENGINES_MAX) {
6235 num_devices = MIC_ENGINES_MAX;
6238 // Determine devices & cpus that can be used for offloading
6239 env_var = getenv("OFFLOAD_DEVICES");
6240 if (env_var != 0 && *env_var != '\0') {
6241 OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DEVICES=%s\n", env_var);
6242 if (strcasecmp(env_var, "none") != 0) {
6243 mic_engines_total =
6244 process_offload_devices(
6245 env_var, num_devices, device_cpu_list);
6246 if (mic_engines_total > 0) {
6247 OFFLOAD_DEBUG_TRACE(2, "Valid value, %d device(s) specified\n",
6248 mic_engines_total);
6250 else {
6251 OFFLOAD_DEBUG_TRACE(2, "Invalid value, will not offload\n");
6252 return;
6255 else {
// No need to continue since there are no offload devices
6257 return;
6260 else {
6261 OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DEVICES is not set\n");
6263 if (mic_engines_total == 0) {
6264 // Fallback to using all available devices and all CPUs on each
6265 OFFLOAD_DEBUG_TRACE(2, "Fallback to all devices\n");
6266 device_cpu_list.clear();
6267 mic_engines_total = 0;
6268 for (int i = 0; i < num_devices; i++) {
6269 COIENGINE engine;
6270 res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
6271 if (res == COI_SUCCESS) {
6272 device_cpu_list.push_back(make_pair(i, (micLcpuMask*)0));
6273 OFFLOAD_DEBUG_TRACE(2, "Device %d is available\n", i);
6274 mic_engines_total++;
6279 // no need to continue if there are no devices to offload to
6280 if (mic_engines_total <= 0) {
6281 return;
6284 // Initialize indexes for available devices
6285 mic_engines = new Engine[mic_engines_total];
6286 std::list<deviceLcpu>::iterator deviceIterator;
6287 int l_idx = 0;
6288 for (deviceIterator = device_cpu_list.begin();
6289 deviceIterator != device_cpu_list.end();
6290 deviceIterator++)
6292 deviceLcpu device_mask_pair = *deviceIterator;
6293 int device_num = device_mask_pair.first;
6294 micLcpuMask *device_mask = device_mask_pair.second;
6296 mic_engines[l_idx].set_indexes(l_idx, device_num);
6297 mic_engines[l_idx].set_cpu_mask(device_mask);
6298 OFFLOAD_DEBUG_TRACE(2,
6299 "Logical MIC%d => Physical MIC%d\n", l_idx, device_num);
6300 if (device_mask != NULL) {
6301 std::string cpu_string =
6302 device_mask->to_string<
6303 char,
6304 std::string::traits_type,
6305 std::string::allocator_type>();
6306 OFFLOAD_DEBUG_TRACE(2, " CPUs: %s\n", cpu_string.data());
6308 else {
6309 OFFLOAD_DEBUG_TRACE(2, " CPUs: all\n");
6311 l_idx++;
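    // Example (illustrative): OFFLOAD_DEVICES="2,0" yields logical
    // MIC0 => physical MIC2 and logical MIC1 => physical MIC0.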
6314 // Get DMA channel count to pass it to COI
6315 env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
6316 if (env_var != 0 && *env_var != '\0') {
6317 OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DMA_CHANNEL_COUNT=%s\n", env_var);
6318 int64_t new_val;
6319 if (__offload_parse_int_string(env_var, new_val)) {
6320 mic_dma_channel_count = new_val;
6321 OFFLOAD_DEBUG_TRACE(2, "Using %d DMA channels\n",
6322 mic_dma_channel_count);
6324 else {
6325 LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6326 "OFFLOAD_DMA_CHANNEL_COUNT");
6329 else {
6330 OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DMA_CHANNEL_COUNT is not set\n");
6333 // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
6334 // Use putenv instead of setenv as Windows has no setenv.
// Note: putenv requires that its argument not be freed or modified,
// so the string must not be freed after the call to putenv or anywhere else.
6337 env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
6338 if (env_var != 0 && *env_var != '\0') {
6339 OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_HOST_THREAD_AFFINITY=%s\n", env_var);
6340 char * new_env_var =
6341 (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
6342 strlen(env_var) + 1);
6343 if (new_env_var == NULL)
6344 LIBOFFLOAD_ERROR(c_malloc);
6345 sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
6346 putenv(new_env_var);
6347 OFFLOAD_DEBUG_TRACE(2, "Setting COI_HOST_THREAD_AFFINITY = %s \n",
6348 getenv("COI_HOST_THREAD_AFFINITY"));
6350 else {
6351 OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_HOST_THREAD_AFFINITY is not set\n");
6354 // library search path for KNC device binaries
6355 env_var = getenv("MIC_LD_LIBRARY_PATH");
6356 if (env_var != 0) {
6357 OFFLOAD_DEBUG_TRACE(2, "---- MIC_LD_LIBRARY_PATH=%s\n", env_var);
6358 knc_library_path = strdup(env_var);
6359 if (knc_library_path == NULL)
6360 LIBOFFLOAD_ERROR(c_malloc);
6361 OFFLOAD_DEBUG_TRACE(2, "KNC library path set to %s\n", knc_library_path);
6363 else {
6364 OFFLOAD_DEBUG_TRACE(2, "MIC_LD_LIBRARY_PATH is not set\n");
6367 // library search path for KNL device binaries
6368 env_var = getenv("LD_LIBRARY_PATH");
6369 if (env_var != 0) {
6370 OFFLOAD_DEBUG_TRACE(2, "---- LD_LIBRARY_PATH=%s\n", env_var);
6371 knl_library_path = strdup(env_var);
6372 if (knl_library_path == NULL)
6373 LIBOFFLOAD_ERROR(c_malloc);
6374 OFFLOAD_DEBUG_TRACE(2, "KNL library path set to %s\n", knl_library_path);
6376 else {
6377 OFFLOAD_DEBUG_TRACE(2, "LD_LIBRARY_PATH is not set\n");
6380 // memory size reserved for COI buffers
6381 env_var = getenv("MIC_BUFFERSIZE");
6382 if (env_var != 0 && *env_var != '\0') {
6383 OFFLOAD_DEBUG_TRACE(2, "---- MIC_BUFFERSIZE=%s\n", env_var);
6384 uint64_t new_size;
6385 if (__offload_parse_size_string(env_var, new_size)) {
6386 mic_buffer_size = new_size;
6387 OFFLOAD_DEBUG_TRACE(2,
6388 "Reserved memory for COI buffers set to %lld bytes\n",
6389 mic_buffer_size);
6391 else {
6392 LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
6395 else {
6396 OFFLOAD_DEBUG_TRACE(2, "MIC_BUFFERSIZE is not set\n");
6399 // memory size reserved for 4K pages for COI buffers
6400 env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
6401 if (env_var != 0 && *env_var != '\0') {
6402 OFFLOAD_DEBUG_TRACE(2, "---- MIC_4K_BUFFER_RESERVE_SIZE=%s\n", env_var);
6403 uint64_t new_size;
6404 if (__offload_parse_size_string(env_var, new_size)) {
6405 mic_4k_buffer_size = new_size;
6406 OFFLOAD_DEBUG_TRACE(2,
6407 "Reserved memory for 4K COI buffers set to %lld bytes\n",
6408 mic_4k_buffer_size);
6410 else {
6411 LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE");
6414 else {
6415 OFFLOAD_DEBUG_TRACE(2, "MIC_4K_BUFFER_RESERVE_SIZE is not set\n");
6418 // memory size reserved for 2M pages for COI buffers
6419 env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
6420 if (env_var != 0 && *env_var != '\0') {
6421 OFFLOAD_DEBUG_TRACE(2, "---- MIC_2M_BUFFER_RESERVE_SIZE=%s\n", env_var);
6422 uint64_t new_size;
6423 if (__offload_parse_size_string(env_var, new_size)) {
6424 mic_2m_buffer_size = new_size;
6425 OFFLOAD_DEBUG_TRACE(2,
6426 "Reserved memory for 2M COI buffers set to %lld bytes\n",
6427 mic_2m_buffer_size);
6429 else {
6430 LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6431 "MIC_2M_BUFFER_RESERVE_SIZE");
6434 else {
6435 OFFLOAD_DEBUG_TRACE(2, "MIC_2M_BUFFER_RESERVE_SIZE is not set\n");
6438 // determine stacksize for the pipeline on the device
6439 env_var = getenv("MIC_STACKSIZE");
6440 if (env_var != 0 && *env_var != '\0') {
6441 OFFLOAD_DEBUG_TRACE(2, "---- MIC_STACKSIZE=%s\n", env_var);
6442 uint64_t new_size;
6443 if (__offload_parse_size_string(env_var, new_size) &&
6444 (new_size >= 16384) && ((new_size & 4095) == 0)) {
6445 mic_stack_size = new_size;
6446 OFFLOAD_DEBUG_TRACE(2, "MIC stack size set to %lld bytes\n",
6447 mic_stack_size);
6449 else {
6450 LIBOFFLOAD_ERROR(c_mic_init3);
6453 else {
6454 OFFLOAD_DEBUG_TRACE(2, "MIC_STACKSIZE is not set\n");
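    // Example (illustrative): a MIC_STACKSIZE value that parses to 1048576
    // is accepted since it is >= 16384 and 4K-aligned; a value that parses
    // to 10000 fails both checks above and reports c_mic_init3.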
6457 // proxy I/O
6458 env_var = getenv("MIC_PROXY_IO");
6459 if (env_var != 0 && *env_var != '\0') {
6460 OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_IO=%s\n", env_var);
6461 int64_t new_val;
6462 if (__offload_parse_int_string(env_var, new_val)) {
6463 mic_proxy_io = new_val;
OFFLOAD_DEBUG_TRACE(2, "MIC proxy i/o set to %d\n",
    mic_proxy_io);
6467 else {
6468 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
6471 else {
6472 OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_IO is not set\n");
6476 env_var = getenv("MIC_PROXY_FS_ROOT");
6477 if (env_var != 0 && *env_var != '\0') {
6478 OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_FS_ROOT=%s\n", env_var);
6479 mic_proxy_fs_root = strdup(env_var);
6480 if (mic_proxy_fs_root == NULL)
6481 LIBOFFLOAD_ERROR(c_malloc);
6482 OFFLOAD_DEBUG_TRACE(2, "MIC proxy fs root set to %s\n",
6483 mic_proxy_fs_root);
6485 else {
6486 OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_FS_ROOT is not set\n");
// Prepare the environment for the target process using the following
// rules:
// - If MIC_ENV_PREFIX is set, then any environment variable on the
//   host which has that prefix is copied to the device without
//   the prefix.
//   All other host environment variables are ignored.
// - If MIC_ENV_PREFIX is not set or MIC_ENV_PREFIX="", then the host
//   environment is duplicated.
6497 env_var = getenv("MIC_ENV_PREFIX");
6498 if (env_var != 0 && *env_var != '\0') {
6499 OFFLOAD_DEBUG_TRACE(2, "---- MIC_ENV_PREFIX=%s\n", env_var);
6500 mic_env_vars.set_prefix(env_var);
6502 int len = strlen(env_var);
6503 for (int i = 0; environ[i] != 0; i++) {
6504 if (strncmp(environ[i], env_var, len) == 0 &&
6505 strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
6506 environ[i][len] != '=') {
6507 mic_env_vars.analyze_env_var(environ[i]);
6511 else {
6512 OFFLOAD_DEBUG_TRACE(2, "MIC_ENV_PREFIX is not set\n");
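    // Example (illustrative): with MIC_ENV_PREFIX=MYAPP_ on the host, a host
    // variable MYAPP_THREADS=4 is passed to the target process as THREADS=4,
    // while host variables without the MYAPP_ prefix are ignored.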
6515 // create key for thread data
6516 if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
6517 LIBOFFLOAD_ERROR(c_mic_init4, errno);
6518 return;
6521 // cpu frequency
6522 cpu_frequency = COI::PerfGetCycleFrequency();
6524 env_var = getenv(mic_use_2mb_buffers_envname);
6525 if (env_var != 0 && *env_var != '\0') {
6526 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6527 mic_use_2mb_buffers_envname, env_var);
6528 uint64_t new_size;
6529 if (__offload_parse_size_string(env_var, new_size)) {
6530 __offload_use_2mb_buffers = new_size;
6531 OFFLOAD_DEBUG_TRACE(2,
6532 "Threshold for use of 2M buffers set to %lld\n",
6533 __offload_use_2mb_buffers);
6535 else {
6536 LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6537 mic_use_2mb_buffers_envname);
6540 else {
6541 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", mic_use_2mb_buffers_envname);
6544 env_var = getenv(mic_use_async_buffer_write_envname);
6545 if (env_var != 0 && *env_var != '\0') {
6546 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6547 mic_use_async_buffer_write_envname, env_var);
6548 uint64_t new_size;
6549 if (__offload_parse_size_string(env_var, new_size)) {
6550 __offload_use_async_buffer_write = new_size;
6551 OFFLOAD_DEBUG_TRACE(2,
6552 "Threshold for async buffer write set to %lld\n",
6553 __offload_use_async_buffer_write);
6556 else {
6557 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6558 mic_use_async_buffer_write_envname);
6561 env_var = getenv(mic_use_async_buffer_read_envname);
6562 if (env_var != 0 && *env_var != '\0') {
6563 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6564 mic_use_async_buffer_read_envname, env_var);
6565 uint64_t new_size;
6566 if (__offload_parse_size_string(env_var, new_size)) {
6567 __offload_use_async_buffer_read = new_size;
6568 OFFLOAD_DEBUG_TRACE(2,
6569 "Threshold for async buffer read set to %lld\n",
6570 __offload_use_async_buffer_read);
6573 else {
6574 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6575 mic_use_async_buffer_read_envname);
6578 // mic initialization type
6579 env_var = getenv(offload_init_envname);
6580 if (env_var != 0 && *env_var != '\0') {
6581 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6582 offload_init_envname, env_var);
6583 if (strcmp(env_var, "on_offload") == 0) {
6584 __offload_init_type = c_init_on_offload;
6585 OFFLOAD_DEBUG_TRACE(2,
6586 "A MIC device will be initialized "
6587 "on first offload to that device\n");
6589 else if (strcmp(env_var, "on_offload_all") == 0) {
6590 __offload_init_type = c_init_on_offload_all;
6591 OFFLOAD_DEBUG_TRACE(2,
6592 "All MIC devices will be initialized "
6593 "on first offload to any device\n");
6595 else if (strcmp(env_var, "on_start") == 0) {
6596 __offload_init_type = c_init_on_start;
6597 OFFLOAD_DEBUG_TRACE(2,
6598 "All MIC devices will be initialized "
6599 "at program start\n");
6601 else {
6602 LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
6605 else {
6606 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_init_envname);
6609 // active wait
6610 env_var = getenv(offload_active_wait_envname);
6611 if (env_var != 0 && *env_var != '\0') {
6612 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6613 offload_active_wait_envname, env_var);
6614 int64_t new_val;
6615 if (__offload_parse_int_string(env_var, new_val)) {
6616 __offload_active_wait = new_val;
6617 OFFLOAD_DEBUG_TRACE(2,
6618 "Flag to poll on event completion is set to %d\n",
6619 __offload_active_wait);
6621 else {
6622 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6623 offload_active_wait_envname);
6626 else {
6627 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_active_wait_envname);
6630 // always wait
6631 env_var = getenv(offload_always_wait_envname);
6632 if (env_var != 0 && *env_var != '\0') {
6633 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6634 offload_always_wait_envname, env_var);
6635 int64_t new_val;
6636 if (__offload_parse_int_string(env_var, new_val)) {
6637 __offload_always_wait = new_val;
OFFLOAD_DEBUG_TRACE(2,
    "Flag to always wait for offload completion is set to %d\n",
    __offload_always_wait);
6642 else {
6643 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6644 offload_always_wait_envname);
6647 else {
6648 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_always_wait_envname);
6651 // omp device num
6652 env_var = getenv(omp_device_num_envname);
6653 if (env_var != 0 && *env_var != '\0') {
6654 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6655 omp_device_num_envname, env_var);
6656 int64_t new_val;
6657 if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
6658 __omp_device_num = new_val;
6659 OFFLOAD_DEBUG_TRACE(2, "OpenMP default device number is set to %d\n",
6660 __omp_device_num);
6662 else {
6663 LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
6664 omp_device_num_envname);
6667 else {
6668 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", omp_device_num_envname);
6671 // parallel copy of offload_transfer
6672 env_var = getenv(parallel_copy_envname);
6673 if (env_var != 0 && *env_var != '\0') {
6674 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6675 parallel_copy_envname, env_var);
6676 int64_t new_val;
6677 if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
6678 __offload_parallel_copy = new_val;
6679 OFFLOAD_DEBUG_TRACE(2,
6680 "Flag for using async buffer copy is set to %d\n",
6681 __offload_parallel_copy);
6683 else {
6684 LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6685 parallel_copy_envname);
6688 else {
6689 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", parallel_copy_envname);
// use COI interface for noncontiguous array transfers
6693 env_var = getenv(use_coi_noncontiguous_transfer_envname);
6694 if (env_var != 0 && *env_var != '\0') {
6695 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6696 use_coi_noncontiguous_transfer_envname, env_var);
6697 uint64_t new_size;
6698 if (__offload_parse_size_string(env_var, new_size)) {
6699 __offload_use_coi_noncontiguous_transfer = new_size;
6700 OFFLOAD_DEBUG_TRACE(2,
6701 "Flag for using new COI noncontiguous API is set to %d\n",
6702 __offload_use_coi_noncontiguous_transfer);
6704 else {
6705 LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6706 use_coi_noncontiguous_transfer_envname);
6709 else {
6710 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6711 use_coi_noncontiguous_transfer_envname);
6714 OFFLOAD_DEBUG_TRACE(2, "---- End of environment variable processing\n");
6716 // init ORSL
6717 ORSL::init();
6720 extern int __offload_init_library(void)
// do one-time initialization
6723 static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
6724 __offload_run_once(&ctrl, __offload_init_library_once);
6726 // offload is available if COI is available and the number of devices > 0
6727 bool is_available = COI::is_available && (mic_engines_total > 0);
6729 // register pending libraries if there are any
6730 if (is_available && __target_libs) {
6731 mutex_locker_t locker(__target_libs_lock);
6733 for (TargetImageList::iterator it = __target_libs_list.begin();
6734 it != __target_libs_list.end(); it++) {
6735 // Register library in COI
6736 COI::ProcessRegisterLibraries(1, &it->data, &it->size,
6737 &it->origin, &it->offset);
6739 // add lib to all engines
6740 for (int i = 0; i < mic_engines_total; i++) {
6741 mic_engines[i].add_lib(*it);
6745 __target_libs = false;
6746 __target_libs_list.clear();
6749 return is_available;
6752 extern "C" bool __offload_target_image_is_executable(const void *target_image)
6754 const struct Image *image = static_cast<const struct Image*>(target_image);
6756 // decode image
6757 const char *name = image->data;
6758 const void *data = image->data + strlen(image->data) + 1;
6760 // determine image type
6761 const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6762 return (hdr->e_type == ET_EXEC);
6765 extern "C" bool __offload_register_image(const void *target_image)
6767 const struct Image *image = static_cast<const struct Image*>(target_image);
6768 const void *data = image->data + strlen(image->data) + 1;
6769 uint64_t size = image->size;
6770 uint64_t offset = 0;
6772 // decode image
6773 const char *fat_name = image->data;
6774 char *mic_name = (char *) malloc(strlen(image->data) + 1);
// The origin part is a strict suffix of image->data when '?' is present,
// so strlen(image->data) bytes are enough for host_name plus its NUL.
char *host_name = (char *) malloc(strlen(image->data));
6776 int i;
6778 if ((mic_name == NULL) || (host_name == NULL))
6779 LIBOFFLOAD_ERROR(c_malloc);
// The origin name is the name of the file on the host;
// it is used by VTune. Since this is a fat binary, we
// use the host file name of the fat binary.
// The driver appends the host file name, separated by "?",
// to the <mic_name> in image->data, so the string needs to be split.
// Name format: <mic_name>?<origin>
6788 // Get <mic_name>
6789 i = 0;
6790 while ((*fat_name != '\0') && (*fat_name != '?')) {
6791 mic_name[i] = *fat_name;
6792 fat_name++;
6793 i++;
// Terminate mic_name, excluding the host file name part
6797 mic_name[i] = '\0';
6799 // Get <host_name>
6800 if (*fat_name == '?') {
6801 // The string following "?" is the name of the host file name.
6802 fat_name++;
6803 i = 0;
6804 while (*fat_name != '\0') {
6805 host_name[i] = *fat_name;
6806 fat_name++;
6807 i++;
6809 host_name[i] = '\0';
6811 else {
// Windows currently does not have a host name
6813 free(host_name);
6814 host_name = 0;
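    // Example (illustrative): image->data == "a_mic.out?/home/user/a.out"
    // decodes to mic_name "a_mic.out" and host_name "/home/user/a.out";
    // with no '?' present (Windows), host_name is left as NULL.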
6817 // our actions depend on the image type
6818 const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6819 switch (hdr->e_type) {
6820 case ET_EXEC:
6821 __current_image_is_dll = false;
6822 // Each offload application is supposed to have only one target
6823 // image representing target executable.
6824 // No thread synchronization is required here as the initialization
6825 // code is always executed in a single thread.
6826 if (__target_exe != 0) {
6827 LIBOFFLOAD_ERROR(c_multiple_target_exes);
6828 exit(1);
6830 __target_exe = new TargetImage(mic_name, data, size, host_name, offset);
6832 // Registration code for execs is always called from the context
6833 // of main and thus we can safely call any function here,
6834 // including LoadLibrary API on windows. This is the place where
6835 // we do the offload library initialization.
6836 if (__offload_init_library()) {
6837 // initialize engine if init_type is on_start
6838 if (__offload_init_type == c_init_on_start) {
6839 for (int i = 0; i < mic_engines_total; i++) {
6840 mic_engines[i].init();
6844 return mic_engines_total > 0;
6846 case ET_DYN:
6848 char * fullname = NULL;
6849 __current_image_is_dll = true;
6850 // We add the library to a list of pending libraries
6851 __target_libs_lock.lock();
6852 __target_libs = true;
6853 __target_libs_list.push_back(
6854 TargetImage(mic_name, data, size, fullname, offset));
6855 __target_libs_lock.unlock();
// If __target_exe is set, then main has started running.
// If main is not running yet, we can't do anything useful here
// because this registration code is called from DllMain
// context (on Windows).
6860 if (__target_exe != 0) {
6861 // There is no need to delay loading the library
6862 if (!__offload_init_library()) {
6863 // Couldn't validate library as a fat offload library
6864 LIBOFFLOAD_ERROR(c_unknown_binary_type);
6865 exit(1);
6868 return true;
6871 default:
6872 // something is definitely wrong, issue an error and exit
6873 LIBOFFLOAD_ERROR(c_unknown_binary_type);
6874 exit(1);
// When dlopen is used, dlclose may happen after the COI process
// is destroyed, in which case images cannot be unloaded and should
// be skipped. So track whether COI has been unloaded.
6881 static bool coi_may_have_been_unloaded = false;
6883 extern "C" void __offload_unregister_image(const void *target_image)
6885 // Target image is packed as follows:
6886 // 8 bytes - size of the target binary
6887 // null-terminated string - binary name
6888 // <size> bytes - binary contents
6889 const struct Image {
6890 int64_t size;
6891 char data[];
6892 } *image = static_cast<const struct Image*>(target_image);
6894 // decode image
6895 const char *name = image->data;
6896 const void *data = image->data + strlen(image->data) + 1;
6898 // our actions depend on the image type
6899 const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6900 if (hdr->e_type == ET_EXEC) {
// We are executing the exec's destructors.
// It is time to do a library cleanup.
6903 if (timer_enabled) {
6904 Offload_Timer_Print();
6907 coi_may_have_been_unloaded = true;
// Do not unload the MYO library if it was loaded in a dll.
6910 if (!__myo_init_in_so)
6912 #ifdef MYO_SUPPORT
6913 __offload_myoFini();
6914 #endif // MYO_SUPPORT
6916 __offload_fini_library();
6919 else if ((hdr->e_type == ET_DYN) && !coi_may_have_been_unloaded) {
6920 for (int i = 0; i < mic_engines_total; i++) {
6921 mic_engines[i].unload_library(data, name);
6927 extern "C" void __offload_register_task_callback(void (*cb)(void *))
6929 task_completion_callback = cb;
6932 // Runtime trace interface for user programs
6934 void __offload_console_trace(int level)
6936 console_enabled = level;
6939 // User-visible offload API
6941 int _Offload_number_of_devices(void)
6943 __offload_init_library();
6944 return mic_engines_total;
6947 int _Offload_get_device_number(void)
6949 return -1;
6952 int _Offload_get_physical_device_number(void)
6954 return -1;
6957 int _Offload_signaled(int index, void *signal)
6959 __offload_init_library();
6961 // check index value
6962 if (index < 0) {
6963 LIBOFFLOAD_ERROR(c_offload_signaled1, index);
6964 LIBOFFLOAD_ABORT;
6967 index %= mic_engines_total;
6969 // find associated async task
6970 OffloadDescriptor *task =
6971 mic_engines[index].find_signal(signal, false);
6972 if (task == 0) {
6973 LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
6974 LIBOFFLOAD_ABORT;
// if the signal was removed by a completed wait
6977 else if (task == SIGNAL_HAS_COMPLETED) {
6978 return (true);
6980 return task->is_signaled();
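// Illustrative usage sketch (variable names hypothetical): poll an
// asynchronous offload that was started with a signal clause:
//
//     char sig;
//     #pragma offload target(mic:0) signal(&sig)
//     { /* ... offloaded work ... */ }
//     while (!_Offload_signaled(0, &sig)) {
//         /* overlap host work here */
//     }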
6983 void _Offload_report(int val)
6985 if (val == OFFLOAD_REPORT_ON ||
6986 val == OFFLOAD_REPORT_OFF) {
6987 offload_report_enabled = val;
6991 int _Offload_find_associated_mic_memory(
6992 int target,
6993 const void* cpu_addr,
6994 void** cpu_base_addr,
6995 uint64_t* buf_length,
6996 void** mic_addr,
6997 uint64_t* mic_buf_start_offset,
6998 int* is_static
7001 __offload_init_library();
7003 // check target value
7004 if (target < 0) {
7005 LIBOFFLOAD_ERROR(c_offload_signaled1, target);
7006 LIBOFFLOAD_ABORT;
7008 target %= mic_engines_total;
7010 // find existing association in pointer table
7011 PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
7012 if (ptr_data == 0) {
7013 OFFLOAD_TRACE(3, "Association does not exist\n");
7014 return 0;
7017 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
7018 ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
7019 ptr_data->is_static);
7021 if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
7022 COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
7023 &ptr_data->mic_addr);
7024 if (res != COI_SUCCESS) {
7025 return 0;
7028 *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
7029 *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
7030 *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
7031 *mic_buf_start_offset = ptr_data->alloc_disp;
7032 *is_static = ptr_data->is_static;
7033 return ptr_data->is_static ? 1 : ptr_data->get_reference();
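// Illustrative usage sketch (hypothetical variables): look up the target
// buffer backing a host pointer on logical device 0:
//
//     void *base, *mic; uint64_t len, off; int is_static;
//     if (_Offload_find_associated_mic_memory(0, host_ptr, &base, &len,
//                                             &mic, &off, &is_static)) {
//         // base/len describe the host range, mic the target-side address
//     }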
7036 _Offload_stream _Offload_stream_create(
7037 int device, // MIC device number
7038 int number_of_cpus // Cores allocated to the stream
7041 __offload_init_library();
7043 // check target value
7044 if (device < 0) {
7045 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7046 LIBOFFLOAD_ABORT;
7048 device %= mic_engines_total;
7050 // Create new stream and get its handle
7051 _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
7052 if (handle == 0) {
7053 OFFLOAD_TRACE(3, "Can't create stream\n");
7054 return 0;
7057 // create pipeline associated with the new stream
7058 mic_engines[device].get_pipeline(handle);
7060 return(handle);
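// Illustrative usage sketch (hypothetical values): create a stream with 4
// cores on logical device 0, use it, then destroy it:
//
//     _Offload_stream s = _Offload_stream_create(0, 4);
//     if (s != 0) {
//         // ... issue offloads into stream s ...
//         _Offload_stream_destroy(0, s);
//     }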
7063 int _Offload_stream_destroy(
7064 int device, // MIC device number
7065 _Offload_stream handle // stream to destroy
7068 if (Stream::get_streams_count() == 0) {
7069 LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7070 LIBOFFLOAD_ABORT;
7072 // check target value
7073 if (device < 0) {
7074 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7075 LIBOFFLOAD_ABORT;
7077 device %= mic_engines_total;
7079 mic_engines[device].stream_destroy(handle);
7081 return(true);
7084 int _Offload_stream_delete(
7085 _Offload_stream handle // stream to destroy
int device = -1; // MIC device number (unknown until the stream is found)
7089 Stream * stream;
7091 if (Stream::get_streams_count() == 0) {
7092 LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7093 LIBOFFLOAD_ABORT;
7096 stream = Stream::find_stream(handle, false);
7097 // the stream was not created or was destroyed
7098 if (!stream) {
7099 LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7100 LIBOFFLOAD_ABORT;
7103 device = stream->get_device();
7105 mic_engines[device].stream_destroy(handle);
7107 return(true);
7110 int _Offload_stream_completed(int device, _Offload_stream handler)
7112 if (Stream::get_streams_count() == 0) {
7113 LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7114 LIBOFFLOAD_ABORT;
7116 // check device index value
7117 if (device < -1) {
7118 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7119 LIBOFFLOAD_ABORT;
7121 else if (device > -1) {
7122 device %= mic_engines_total;
7124 // get stream
7125 Stream * stream;
7127 if (handler != 0) {
7128 stream = Stream::find_stream(handler, false);
7130 // the stream was not created or was destroyed
7131 if (!stream) {
7132 LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7133 LIBOFFLOAD_ABORT;
7136 if (device != stream->get_device()) {
7137 LIBOFFLOAD_ERROR(c_offload_device_doesnt_match_to_stream,
7138 stream->get_device());
7139 LIBOFFLOAD_ABORT;
7141 // find associated async task
7142 OffloadDescriptor *task = stream->get_last_offload();
7144 // offload was completed by offload_wait pragma or wait clause
7145 if (task == 0) {
7146 return(true);
7148 return task->is_signaled();
// a zero handler denotes all streams on the device
7151 else {
7152 StreamMap stream_map = Stream::all_streams;
7153 for (StreamMap::iterator it = stream_map.begin();
7154 it != stream_map.end(); it++) {
7155 Stream * stream = it->second;
7156 if (device != -1 && device != stream->get_device()) {
7157 continue;
7159 // find associated async task
7160 OffloadDescriptor *task = stream->get_last_offload();
7162 // offload was completed by offload_wait pragma or wait clause
7163 if (task == 0) {
7164 continue;
// if even one stream is not completed, the result is false
7167 if (!task->is_signaled()) {
7168 return false;
7171 // no uncompleted streams
7172 return true;
7176 int _Offload_stream_is_empty(_Offload_stream handle)
int device = -1; // unknown until the stream is found
7180 if (Stream::get_streams_count() == 0) {
7181 LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7182 LIBOFFLOAD_ABORT;
7184 if (handle != 0) {
7185 Stream * stream = Stream::find_stream(handle, false);
7187 // the stream was not created or was destroyed
7188 if (!stream) {
7189 LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7190 LIBOFFLOAD_ABORT;
7192 device = stream->get_device();
7194 else {
7195 device = -1;
// device is either the stream's own device or -1 (all devices when
// handle == 0), so _Offload_stream_completed's device check passes
7199 return _Offload_stream_completed(device, handle);
7202 int _Offload_device_streams_completed(int device)
7204 if (Stream::get_streams_count() == 0) {
7205 LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7206 LIBOFFLOAD_ABORT;
7208 // check index value
7209 if (device < -1) {
7210 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7211 LIBOFFLOAD_ABORT;
7213 else if (device > -1) {
7214 device %= mic_engines_total;
7217 StreamMap stream_map = Stream::all_streams;
7218 for (StreamMap::iterator it = stream_map.begin();
7219 it != stream_map.end(); it++)
7221 Stream * stream = it->second;
7223 if (device != -1 && device != stream->get_device()) {
7224 continue;
7226 // find associated async task
7227 OffloadDescriptor *task = stream->get_last_offload();
7229 // offload was completed by offload_wait pragma or wait clause
7230 if (task == 0) {
7231 continue;
// if even one stream is not completed, the result is false
7234 if (!task->is_signaled()) {
7235 return false;
7238 // no uncompleted streams
7239 return true;
7242 // IDB support
7243 int __dbg_is_attached = 0;
7244 int __dbg_target_id = -1;
7245 pid_t __dbg_target_so_pid = -1;
7246 char __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
7247 const int __dbg_api_major_version = 1;
7248 const int __dbg_api_minor_version = 0;
void __dbg_target_so_loaded()
{
}
7253 void __dbg_target_so_unloaded()