Remove assert in get_def_bb_for_const
[official-gcc.git] / liboffloadmic / runtime / offload_host.cpp
blobe52019dfb28e7733cba65fc506d160ccb27e4fba
1 /*
2 Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 // Forward declaration as the following 2 functions are declared as friend
32 // in offload_engine.h.
33 // CLANG does not like static to been after friend declaration.
34 static void __offload_init_library_once(void);
35 static void __offload_fini_library(void);
37 #include "offload_host.h"
38 #ifdef MYO_SUPPORT
39 #include "offload_myo_host.h"
40 #endif
42 #include <malloc.h>
43 #ifndef TARGET_WINNT
44 #include <alloca.h>
45 #include <elf.h>
46 #endif // TARGET_WINNT
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
53 #include <sys/stat.h>
55 #include <algorithm>
56 #include <bitset>
58 #if defined(HOST_WINNT)
59 #define PATH_SEPARATOR ";"
60 #else
61 #define PATH_SEPARATOR ":"
62 #endif
// Returns the offload sequence number recorded in timer_data, or 0 when
// no timer data is available.
64 #define GET_OFFLOAD_NUMBER(timer_data) \
65 timer_data? timer_data->offload_number : 0
// Optional callback invoked on offload task completion — presumably
// registered through a public API not visible in this chunk; TODO confirm.
67 static void (*task_completion_callback)(void *);
69 extern "C" {
70 #ifdef TARGET_WINNT
71 // Windows does not support imports from libraries without actually
72 // including them as dependence. We don't want to include in the
73 // dependence since is it used only for Fortran when traceback is enabled.
74 // Chose to implement it with GetProcAddress.
75 #define FORTRAN_TRACE_BACK win_for__continue_traceback
// Dynamically resolves for__continue_traceback from the Fortran runtime
// DLL and invokes it; exits the process if the DLL or symbol is missing.
76 int win_for__continue_traceback( _Offload_result coi_offload_result )
78 HINSTANCE hDLL;
79 int (* TraceBackRoutine)(_Offload_result value);
81 hDLL = LoadLibrary("libifcoremd.dll");
82 if (hDLL != 0) {
83 TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
84 "for__continue_traceback");
85 if (TraceBackRoutine != 0) {
86 return TraceBackRoutine(coi_offload_result);
// NOTE(review): the messages below name libifcorert.dll, but the DLL
// loaded above is libifcoremd.dll — confirm which name is intended.
88 else {
89 OFFLOAD_TRACE(3,
90 "Cannot find for__continue_traceback routine in libifcorert.dll\n");
91 exit(1);
94 else {
95 OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
96 exit(1);
98 return 0;
101 #else // TARGET_WINNT
103 #define FORTRAN_TRACE_BACK for__continue_traceback
105 // for__continue_traceback is provided as a dummy to resolve link time symbols
106 // for C/C++ programs. For Fortran the actual fortran library function in
107 // libifcore.so is used.
108 #pragma weak for__continue_traceback
// Weak dummy: should never actually run — traces a diagnostic and exits.
109 int for__continue_traceback( _Offload_result coi_offload_result )
111 OFFLOAD_TRACE(3,
112 "liboffload function for_continue_traceback should not be called.\n");
113 exit(1);
115 #endif //TARGET_WINNT
116 } // extern "C"
118 #ifdef TARGET_WINNT
119 // Small subset of ELF declarations for Windows which is needed to compile
120 // this file. ELF header is used to understand what binary type is contained
121 // in the target image - shared library or executable.
123 typedef uint16_t Elf64_Half;
124 typedef uint32_t Elf64_Word;
125 typedef uint64_t Elf64_Addr;
126 typedef uint64_t Elf64_Off;
128 #define EI_NIDENT 16
// Object file types used for the distinction described above:
// ET_EXEC = executable, ET_DYN = shared object.
130 #define ET_EXEC 2
131 #define ET_DYN 3
// Mirror of the standard Elf64_Ehdr layout from <elf.h>; e_type is the
// field consulted to classify the target image.
133 typedef struct
135 unsigned char e_ident[EI_NIDENT];
136 Elf64_Half e_type;
137 Elf64_Half e_machine;
138 Elf64_Word e_version;
139 Elf64_Addr e_entry;
140 Elf64_Off e_phoff;
141 Elf64_Off e_shoff;
142 Elf64_Word e_flags;
143 Elf64_Half e_ehsize;
144 Elf64_Half e_phentsize;
145 Elf64_Half e_phnum;
146 Elf64_Half e_shentsize;
147 Elf64_Half e_shnum;
148 Elf64_Half e_shstrndx;
149 } Elf64_Ehdr;
150 #endif // TARGET_WINNT
152 // Host console and file logging
153 const char *prefix;
154 int console_enabled = 0;
155 int offload_number = 0;
// Environment variable names controlling tracing, reporting and timing.
157 static const char *htrace_envname = "H_TRACE";
158 static const char *offload_report_envname = "OFFLOAD_REPORT";
159 static const char *timer_envname = "H_TIME";
161 // location of offload_main executable
162 // To be used if the main application has no offload and is not built
163 // with -offload but dynamic library linked in has offload pragma
164 char* mic_device_main = 0;
166 // DMA channel count used by COI and set via
167 // OFFLOAD_DMA_CHANNEL_COUNT environment variable
168 uint32_t mic_dma_channel_count;
170 // Trace information
// Indexed by VarDesc direction bits when tracing variable descriptors
// (see the trace in setup_descriptors).
171 static const char* vardesc_direction_as_string[] = {
172 "NOCOPY",
173 "IN",
174 "OUT",
175 "INOUT"
// Indexed by VarDesc type codes (type.src / type.dst) when tracing.
177 static const char* vardesc_type_as_string[] = {
178 "unknown",
179 "data",
180 "data_ptr",
181 "func_ptr",
182 "void_ptr",
183 "string_ptr",
184 "dv",
185 "dv_data",
186 "dv_data_slice",
187 "dv_ptr",
188 "dv_ptr_data",
189 "dv_ptr_data_slice",
190 "cean_var",
191 "cean_var_ptr",
192 "c_data_ptr_array",
193 "c_func_ptr_array",
194 "c_void_ptr_array",
195 "c_string_ptr_array"
// Per-engine device state; presumably sized to mic_engines_total during
// library initialization — initialization site is not in this chunk.
198 Engine* mic_engines = 0;
199 uint32_t mic_engines_total = 0;
200 pthread_key_t mic_thread_key;
201 MicEnvVar mic_env_vars;
202 uint64_t cpu_frequency = 0;
204 // MIC_STACKSIZE
205 uint32_t mic_stack_size = 12 * 1024 * 1024;
207 // MIC_BUFFERSIZE
208 uint64_t mic_buffer_size = 0;
210 // Preallocated 4K page memory size for buffers on MIC
211 uint64_t mic_4k_buffer_size = 0;
213 // Preallocated 2M page memory size for buffers on MIC
214 uint64_t mic_2m_buffer_size = 0;
217 // MIC_LD_LIBRARY_PATH
218 char* mic_library_path = 0;
220 // MIC_PROXY_IO
221 bool mic_proxy_io = true;
223 // MIC_PROXY_FS_ROOT
224 char* mic_proxy_fs_root = 0;
226 // Threshold for creating buffers with large pages. Buffer is created
227 // with large pages hint if its size exceeds the threshold value.
228 // By default large pages are disabled right now (by setting default
229 // value for threshold to MAX) due to HSD 4114629.
230 uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
231 static const char *mic_use_2mb_buffers_envname =
232 "MIC_USE_2MB_BUFFERS";
// Size thresholds above which buffer writes/reads go asynchronous.
234 static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
235 static const char *mic_use_async_buffer_write_envname =
236 "MIC_USE_ASYNC_BUFFER_WRITE";
238 static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
239 static const char *mic_use_async_buffer_read_envname =
240 "MIC_USE_ASYNC_BUFFER_READ";
242 // device initialization type
243 OffloadInitType __offload_init_type = c_init_on_offload_all;
244 static const char *offload_init_envname = "OFFLOAD_INIT";
246 // active wait
247 static bool __offload_active_wait = true;
248 static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
250 // OMP_DEFAULT_DEVICE
251 int __omp_device_num = 0;
252 static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
254 //OFFLOAD_PARALLEL_COPY
255 static bool __offload_parallel_copy = false;
256 static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";
258 //Use COI interface for noncontiguous transfer if it exists.
259 static bool __offload_use_coi_noncontiguous_transfer = false;
260 static const char *use_coi_noncontiguous_transfer_envname =
261 "MIC_USE_COI_MULTI_D";
263 // The list of pending target libraries
264 static bool __target_libs;
265 static TargetImageList __target_libs_list;
266 static mutex_t __target_libs_lock;
// Serializes persistent stack-buffer management
// (see offload_stack_memory_manager).
267 static mutex_t stack_alloc_lock;
269 // Target executable
270 TargetImage* __target_exe;
272 // Print readable offload flags
273 static void trace_offload_flags(
274 OffloadHostTimerData* timer_data,
275 OffloadFlags offload_flags
278 // Sized big enough for all flag names
279 char fbuffer[256];
280 bool first = true;
281 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
282 sprintf(fbuffer, " OffloadFlags=(");
283 if (offload_flags.bits.fortran_traceback) {
284 sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
285 first = false;
287 if (offload_flags.bits.omp_async) {
288 sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async");
289 first = false;
291 OFFLOAD_DEBUG_TRACE_1(1,
292 GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
293 "%s)\n", fbuffer);
297 // Print readable varDesc flags
298 static void trace_varDesc_flags(
299 OffloadHostTimerData* timer_data,
300 varDescFlags offload_flags
303 // SIzed big enough for all flag names
304 char fbuffer[256];
305 bool first = true;
306 if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
307 sprintf(fbuffer, " varDescFlags=(");
308 if (offload_flags.is_static) {
309 sprintf(fbuffer+strlen(fbuffer), "is_static");
310 first = false;
312 if (offload_flags.is_static_dstn) {
313 sprintf(fbuffer+strlen(fbuffer),
314 first ? "is_static_dstn" : ",is_static_dstn");
315 first = false;
317 if (offload_flags.has_length) {
318 sprintf(fbuffer+strlen(fbuffer),
319 first ? "has_length" : ",has_length");
320 first = false;
322 if (offload_flags.is_stack_buf) {
323 sprintf(fbuffer+strlen(fbuffer),
324 first ? "is_stack_buf" : ",is_stack_buf");
325 first = false;
327 if (offload_flags.targetptr) {
328 sprintf(fbuffer+strlen(fbuffer),
329 first ? "targetptr" : ",targetptr");
330 first = false;
332 if (offload_flags.preallocated) {
333 sprintf(fbuffer+strlen(fbuffer),
334 first ? "preallocated" : ",preallocated");
335 first = false;
337 if (offload_flags.is_pointer) {
338 sprintf(fbuffer+strlen(fbuffer),
339 first ? "is_pointer" : ",is_pointer");
340 first = false;
342 if (offload_flags.sink_addr) {
343 sprintf(fbuffer+strlen(fbuffer),
344 first ? "sink_addr" : ",sink_addr");
345 first = false;
347 if (offload_flags.alloc_disp) {
348 sprintf(fbuffer+strlen(fbuffer),
349 first ? "alloc_disp" : ",alloc_disp");
350 first = false;
352 if (offload_flags.is_noncont_src) {
353 sprintf(fbuffer+strlen(fbuffer),
354 first ? "is_noncont_src" : ",is_noncont_src");
355 first = false;
357 if (offload_flags.is_noncont_dst) {
358 sprintf(fbuffer+strlen(fbuffer),
359 first ? "is_noncont_dst" : ",is_noncont_dst");
360 first = false;
362 if (offload_flags.always_copy) {
363 sprintf(fbuffer+strlen(fbuffer),
364 first ? "always_copy" : ",always_copy");
365 first = false;
367 if (offload_flags.always_delete) {
368 sprintf(fbuffer+strlen(fbuffer),
369 first ? "always_delete" : ",always_delete");
370 first = false;
372 OFFLOAD_DEBUG_TRACE_1(1,
373 GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
374 "%s)\n", fbuffer);
378 static char * offload_get_src_base(void * ptr, uint8_t type)
380 char *base;
381 if (VAR_TYPE_IS_PTR(type)) {
382 base = *static_cast<char**>(ptr);
384 else if (VAR_TYPE_IS_SCALAR(type)) {
385 base = static_cast<char*>(ptr);
387 else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
388 ArrDesc *dvp;
389 if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
390 const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
391 dvp = (type == c_dv_data_slice) ?
392 reinterpret_cast<ArrDesc*>(ap->base) :
393 *reinterpret_cast<ArrDesc**>(ap->base);
395 else {
396 dvp = (type == c_dv_data) ?
397 static_cast<ArrDesc*>(ptr) :
398 *static_cast<ArrDesc**>(ptr);
400 base = reinterpret_cast<char*>(dvp->Base);
402 else {
403 base = NULL;
405 return base;
408 void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
410 // special case for the 'process died' error
411 if (res == COI_PROCESS_DIED) {
412 m_device.fini_process(true);
414 else {
415 switch (msg) {
416 case c_buf_create:
417 if (res == COI_OUT_OF_MEMORY) {
418 msg = c_buf_create_out_of_mem;
420 /* fallthru */
422 case c_buf_create_from_mem:
423 case c_buf_get_address:
424 case c_pipeline_create:
425 case c_pipeline_run_func:
426 LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
427 break;
429 case c_buf_read:
430 case c_buf_write:
431 case c_buf_copy:
432 case c_buf_map:
433 case c_buf_unmap:
434 case c_buf_destroy:
435 case c_buf_set_state:
436 LIBOFFLOAD_ERROR(msg, res);
437 break;
439 default:
440 break;
444 exit(1);
447 _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
449 switch (res) {
450 case COI_SUCCESS:
451 return OFFLOAD_SUCCESS;
453 case COI_PROCESS_DIED:
454 return OFFLOAD_PROCESS_DIED;
456 case COI_OUT_OF_MEMORY:
457 return OFFLOAD_OUT_OF_MEMORY;
459 default:
460 return OFFLOAD_ERROR;
464 // is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
465 // is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
466 // allocate memory at target; use its value as base in target table.
467 // is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
468 // base - is address at target of preallocated memory; use its value as
469 // base in target table.
// Creates or looks up the PtrData association covering [base+disp,
// base+disp+size), creating host/target COI buffers as needed per the
// contract above. Returns false when a COI call fails (failure is recorded
// in m_status or reported via report_coi_error); true otherwise.
471 bool OffloadDescriptor::alloc_ptr_data(
472 PtrData* &ptr_data,
473 void *base,
474 int64_t disp,
475 int64_t size,
476 int64_t alloc_disp,
477 int align,
478 bool is_targptr,
479 bool is_prealloc,
480 bool pin
483 // total length of base
484 int64_t length = size;
485 bool is_new;
486 COIBUFFER targptr_buf;
487 COIRESULT res;
488 uint32_t buffer_flags = 0;
489 char * base_disp = reinterpret_cast<char *>(base) + disp;
491 // create buffer with large pages if data length exceeds
492 // large page threshold
493 if (length >= __offload_use_2mb_buffers) {
494 buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
496 // Allocate memory at target for targetptr without preallocated as we need
497 // its address as base argument in call to m_device.insert_ptr_data
498 if (is_targptr && !is_prealloc) {
499 length = alloc_disp ? length : size + disp;
500 res = COI::BufferCreate(
501 length,
502 COI_BUFFER_NORMAL,
503 buffer_flags,
506 &m_device.get_process(),
507 &targptr_buf);
508 if (res != COI_SUCCESS) {
509 if (m_status != 0) {
510 m_status->result = translate_coi_error(res);
512 else if (m_is_mandatory) {
513 report_coi_error(c_buf_create, res);
515 return false;
// Rewrite 'base' with the sink-side address of the fresh target buffer.
518 res = COI::BufferGetSinkAddress(
519 targptr_buf, reinterpret_cast<uint64_t *>(&base));
520 if (res != COI_SUCCESS) {
521 if (m_status != 0) {
522 m_status->result = translate_coi_error(res);
524 else if (m_is_mandatory) {
525 report_coi_error(c_buf_get_address, res);
527 return false;
531 OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
532 alloc_disp ? base : base_disp,
533 alloc_disp ? length : size + disp);
535 // add new entry
537 ptr_data = is_targptr ?
538 m_device.find_targetptr_data(base_disp) :
539 m_device.find_ptr_data(base_disp);
540 // if ptr_data is found just need to check it for overlapping
541 if (ptr_data) {
542 is_new = false;
543 base = base_disp;
545 else {
546 // If association is not found we must create it.
547 length = alloc_disp ? length : size + disp;
548 ptr_data = is_targptr ?
549 m_device.insert_targetptr_data(base, length, is_new) :
550 m_device.insert_ptr_data(base, length, is_new);
552 if (is_new) {
554 OFFLOAD_TRACE(3, "Added new association\n");
556 if (length > 0) {
557 OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
559 // align should be a power of 2
560 if (!pin && !is_targptr &&
561 align > 0 && (align & (align - 1)) == 0) {
562 // offset within mic_buffer. Can do offset optimization
563 // only when source address alignment satisfies requested
564 // alignment on the target (cq172736).
565 if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
566 ptr_data->mic_offset =
567 reinterpret_cast<intptr_t>(base) & 4095;
571 // buffer size and flags
572 uint64_t buffer_size = length + ptr_data->mic_offset;
574 // For targetptr there is no CPU buffer
575 if (pin || !is_targptr) {
576 // create CPU buffer
577 OFFLOAD_DEBUG_TRACE_1(3,
578 GET_OFFLOAD_NUMBER(get_timer_data()),
579 c_offload_create_buf_host,
580 "Creating buffer from source memory %p, "
581 "length %lld\n", base, length);
583 // result is not checked because we can continue without cpu
584 // buffer. In this case we will use COIBufferRead/Write
585 // instead of COIBufferCopy.
587 COI::BufferCreateFromMemory(length,
588 COI_BUFFER_NORMAL,
590 base,
592 &m_device.get_process(),
593 &ptr_data->cpu_buf);
596 // create MIC buffer
597 if (is_prealloc) {
598 OFFLOAD_DEBUG_TRACE_1(3,
599 GET_OFFLOAD_NUMBER(get_timer_data()),
600 c_offload_create_buf_mic,
601 "Creating buffer from sink memory: size %lld, offset %d, "
602 "flags =0x%x\n", buffer_size,
603 ptr_data->mic_offset, buffer_flags);
604 res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
605 COI_BUFFER_NORMAL,
606 COI_SINK_MEMORY,
607 base,
609 &m_device.get_process(),
610 &ptr_data->mic_buf);
611 if (res != COI_SUCCESS) {
612 if (m_status != 0) {
613 m_status->result = translate_coi_error(res);
615 else if (m_is_mandatory) {
616 report_coi_error(c_buf_create, res);
// The new entry was inserted locked; release before bailing out.
618 ptr_data->alloc_ptr_data_lock.unlock();
619 return false;
622 else if (is_targptr) {
623 ptr_data->mic_buf = targptr_buf;
625 else if (!pin) {
626 OFFLOAD_DEBUG_TRACE_1(3,
627 GET_OFFLOAD_NUMBER(get_timer_data()),
628 c_offload_create_buf_mic,
629 "Creating buffer for sink: size %lld, offset %d, "
630 "flags =0x%x\n", buffer_size,
631 ptr_data->mic_offset, buffer_flags);
632 res = COI::BufferCreate(buffer_size,
633 COI_BUFFER_NORMAL,
634 buffer_flags,
637 &m_device.get_process(),
638 &ptr_data->mic_buf);
639 if (res != COI_SUCCESS) {
640 if (m_status != 0) {
641 m_status->result = translate_coi_error(res);
643 else if (m_is_mandatory) {
644 report_coi_error(c_buf_create, res);
646 ptr_data->alloc_ptr_data_lock.unlock();
647 return false;
651 if (!pin) {
652 // make buffer valid on the device.
653 res = COI::BufferSetState(ptr_data->mic_buf,
654 m_device.get_process(),
655 COI_BUFFER_VALID,
656 COI_BUFFER_NO_MOVE,
657 0, 0, 0);
658 if (res != COI_SUCCESS) {
659 if (m_status != 0) {
660 m_status->result = translate_coi_error(res);
662 else if (m_is_mandatory) {
663 report_coi_error(c_buf_set_state, res);
665 ptr_data->alloc_ptr_data_lock.unlock();
666 return false;
// Invalidate the source-side view so subsequent copies are explicit.
669 res = COI::BufferSetState(ptr_data->mic_buf,
670 COI_PROCESS_SOURCE,
671 COI_BUFFER_INVALID,
672 COI_BUFFER_NO_MOVE,
673 0, 0, 0);
674 if (res != COI_SUCCESS) {
675 if (m_status != 0) {
676 m_status->result = translate_coi_error(res);
678 else if (m_is_mandatory) {
679 report_coi_error(c_buf_set_state, res);
681 ptr_data->alloc_ptr_data_lock.unlock();
682 return false;
686 ptr_data->alloc_disp = alloc_disp;
687 ptr_data->alloc_ptr_data_lock.unlock();
689 else {
// Existing entry: take the lock only to validate the requested range.
690 mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
692 OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
693 "is_static %d\n",
694 ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
695 ptr_data->is_static);
697 // This is not a new entry. Make sure that provided address range fits
698 // into existing one.
699 MemRange addr_range(base, length);
700 if (!ptr_data->cpu_addr.contains(addr_range)) {
701 LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
702 const_cast<void *>(ptr_data->cpu_addr.start()),
703 ptr_data->cpu_addr.length());
704 exit(1);
707 // if the entry is associated with static data it may not have buffers
708 // created because they are created on demand.
709 if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
710 return false;
714 return true;
// Looks up an existing PtrData association covering [in_base+disp,
// in_base+disp+size). Sets ptr_data to the match, or to 0 when there is
// no (full) match and report_error is false. Returns false only when
// on-demand buffer creation for a static-data entry fails; a missing or
// partially-overlapping association with report_error set exits instead.
717 bool OffloadDescriptor::find_ptr_data(
718 PtrData* &ptr_data,
719 void *in_base,
720 int64_t disp,
721 int64_t size,
722 bool is_targetptr,
723 bool report_error
726 // total length of base
727 int64_t length = size;
728 char *base = reinterpret_cast<char *>(in_base) + disp;
730 OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
731 "length %lld\n", base, length);
733 // find existing association in pointer table
734 ptr_data = is_targetptr ?
735 m_device.find_targetptr_data(base) :
736 m_device.find_ptr_data(base);
737 if (ptr_data == 0) {
738 if (report_error) {
739 LIBOFFLOAD_ERROR(c_no_ptr_data, base);
740 exit(1);
742 OFFLOAD_TRACE(3, "Association does not exist\n");
743 return true;
746 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
747 ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
748 ptr_data->is_static);
750 // make sure that provided address range fits into existing one
751 MemRange addr_range(base, length);
752 if (!ptr_data->cpu_addr.contains(addr_range)) {
753 if (report_error) {
754 LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
755 const_cast<void *>(ptr_data->cpu_addr.start()),
756 ptr_data->cpu_addr.length());
757 exit(1);
// Partial overlap without report_error: treat as "not found".
759 OFFLOAD_TRACE(3, "Existing association partially overlaps with "
760 "data address range\n");
761 ptr_data = 0;
762 return true;
765 // if the entry is associated with static data it may not have buffers
766 // created because they are created on demand.
767 if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
768 return false;
771 return true;
// Lazily creates the host (cpu_buf) and target (mic_buf) COI buffers for an
// association backed by static data; buffers for static data are created on
// demand rather than at association time. Returns false when buffer
// creation fails and m_status is set; otherwise reports fatally.
774 bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
776 OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
778 if (ptr_data->cpu_buf == 0) {
779 OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
780 ptr_data->cpu_addr.start());
782 COIRESULT res = COI::BufferCreateFromMemory(
783 ptr_data->cpu_addr.length(),
784 COI_BUFFER_NORMAL,
786 const_cast<void*>(ptr_data->cpu_addr.start()),
787 1, &m_device.get_process(),
788 &ptr_data->cpu_buf);
790 if (res != COI_SUCCESS) {
791 if (m_status != 0) {
792 m_status->result = translate_coi_error(res);
793 return false;
795 report_coi_error(c_buf_create_from_mem, res);
799 if (ptr_data->mic_buf == 0) {
800 OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
801 ptr_data->mic_addr);
// Wrap the already-known sink-side address as a COI buffer.
803 COIRESULT res = COI::BufferCreateFromMemory(
804 ptr_data->cpu_addr.length(),
805 COI_BUFFER_NORMAL,
806 COI_SINK_MEMORY,
807 reinterpret_cast<void*>(ptr_data->mic_addr),
808 1, &m_device.get_process(),
809 &ptr_data->mic_buf);
811 if (res != COI_SUCCESS) {
812 if (m_status != 0) {
813 m_status->result = translate_coi_error(res);
814 return false;
816 report_coi_error(c_buf_create_from_mem, res);
820 return true;
823 bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
825 if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
826 COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
827 &ptr_data->mic_addr);
828 if (res != COI_SUCCESS) {
829 if (m_status != 0) {
830 m_status->result = translate_coi_error(res);
832 else if (m_is_mandatory) {
833 report_coi_error(c_buf_get_address, res);
835 return false;
838 return true;
841 bool OffloadDescriptor::nullify_target_stack(
842 COIBUFFER targ_buf,
843 uint64_t size
846 char * ptr = (char*)malloc(size);
847 if (ptr == NULL)
848 LIBOFFLOAD_ERROR(c_malloc);
849 COIRESULT res;
851 memset(ptr, 0, size);
852 res = COI::BufferWrite(
853 targ_buf,
855 ptr,
856 size,
857 COI_COPY_UNSPECIFIED,
858 0, 0, 0);
859 free(ptr);
860 if (res != COI_SUCCESS) {
861 if (m_status != 0) {
862 m_status->result = translate_coi_error(res);
863 return false;
865 report_coi_error(c_buf_write, res);
867 return true;
// Manages persistent target-side stack buffers, keyed by host stack
// address, routine id and thread id. Reuses an existing buffer when the
// same stack frame/routine is seen again, schedules obsolete entries for
// destruction, and otherwise creates and zero-fills a fresh buffer.
// *is_new is set true only when a new buffer was allocated.
870 bool OffloadDescriptor::offload_stack_memory_manager(
871 const void * stack_begin,
872 int routine_id,
873 int buf_size,
874 int align,
875 bool *is_new)
// Whole-list scan and mutation are serialized by the global lock.
877 mutex_locker_t locker(stack_alloc_lock);
879 PersistData * new_el;
880 PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
881 PersistDataList::iterator it_end;
882 int erase = 0;
883 uint64_t cur_thread_id = m_device.get_thread_id();
885 *is_new = false;
887 for (PersistDataList::iterator it = m_device.m_persist_list.begin();
888 it != m_device.m_persist_list.end(); it++) {
889 PersistData cur_el = *it;
891 if (stack_begin > it->stack_cpu_addr) {
892 // this stack data must be destroyed
893 if (cur_thread_id == cur_el.thread_id) {
894 m_destroy_stack.push_front(cur_el.stack_ptr_data);
895 it_end = it;
896 erase++;
899 else if (stack_begin == it->stack_cpu_addr) {
900 if (routine_id != it-> routine_id) {
901 // this stack data must be destroyed
902 m_destroy_stack.push_front(cur_el.stack_ptr_data);
903 it_end = it;
904 erase++;
905 break;
907 else {
908 // stack data is reused
909 m_stack_ptr_data = it->stack_ptr_data;
910 if (erase > 0) {
911 // all obsolete stack sections must be erased from the list
912 m_device.m_persist_list.erase(it_begin, ++it_end);
// sizeof does not evaluate its operand, so reading through the
// not-yet-assigned 'new_el' here is safe.
914 m_in_datalen +=
915 erase * sizeof(new_el->stack_ptr_data->mic_addr);
917 OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
918 m_stack_ptr_data->mic_addr);
919 return true;
922 else if (stack_begin < it->stack_cpu_addr &&
923 cur_thread_id == cur_el.thread_id) {
924 break;
928 if (erase > 0) {
929 // all obsolete stack sections must be erased from the list
930 m_device.m_persist_list.erase(it_begin, ++it_end);
931 m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
933 // new stack table is created
934 new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
935 // create MIC buffer
936 COIRESULT res;
937 uint32_t buffer_flags = 0;
939 // create buffer with large pages if data length exceeds
940 // large page threshold
941 if (buf_size >= __offload_use_2mb_buffers) {
942 buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
944 res = COI::BufferCreate(buf_size,
945 COI_BUFFER_NORMAL,
946 buffer_flags,
949 &m_device.get_process(),
950 &new_el->stack_ptr_data->mic_buf);
951 if (res != COI_SUCCESS) {
952 if (m_status != 0) {
953 m_status->result = translate_coi_error(res);
955 else if (m_is_mandatory) {
956 report_coi_error(c_buf_create, res);
958 return false;
960 // make buffer valid on the device.
961 res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
962 m_device.get_process(),
963 COI_BUFFER_VALID,
964 COI_BUFFER_NO_MOVE,
965 0, 0, 0);
966 if (res != COI_SUCCESS) {
967 if (m_status != 0) {
968 m_status->result = translate_coi_error(res);
970 else if (m_is_mandatory) {
971 report_coi_error(c_buf_set_state, res);
973 return false;
// Invalidate the source-side view of the new buffer.
975 res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
976 COI_PROCESS_SOURCE,
977 COI_BUFFER_INVALID,
978 COI_BUFFER_NO_MOVE,
979 0, 0, 0);
980 if (res != COI_SUCCESS) {
981 if (m_status != 0) {
982 m_status->result = translate_coi_error(res);
984 else if (m_is_mandatory) {
985 report_coi_error(c_buf_set_state, res);
987 return false;
989 // persistence algorithm requires target stack initialy to be nullified
990 if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
991 return false;
994 m_stack_ptr_data = new_el->stack_ptr_data;
995 init_mic_address(m_stack_ptr_data);
996 OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
997 m_stack_ptr_data->mic_addr);
998 m_device.m_persist_list.push_front(*new_el);
999 init_mic_address(new_el->stack_ptr_data);
1000 *is_new = true;
1001 return true;
1004 bool OffloadDescriptor::setup_descriptors(
1005 VarDesc *vars,
1006 VarDesc2 *vars2,
1007 int vars_total,
1008 int entry_id,
1009 const void *stack_addr
1012 COIRESULT res;
1014 OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
1016 // make a copy of variable descriptors
1017 m_vars_total = vars_total;
1018 if (vars_total > 0) {
1019 m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
1020 if (m_vars == NULL)
1021 LIBOFFLOAD_ERROR(c_malloc);
1022 memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
1023 m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
1024 if (m_vars_extra == NULL)
1025 LIBOFFLOAD_ERROR(c_malloc);
1028 // dependencies
1029 m_in_deps_allocated = m_vars_total + 1;
1030 m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
1031 if (m_in_deps == NULL)
1032 LIBOFFLOAD_ERROR(c_malloc);
1033 if (m_vars_total > 0) {
1034 m_out_deps_allocated = m_vars_total;
1035 m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
1036 if (m_out_deps == NULL)
1037 LIBOFFLOAD_ERROR(c_malloc);
1040 // copyin/copyout data length
1041 m_in_datalen = 0;
1042 m_out_datalen = 0;
1044 // First pass over variable descriptors
1045 // - Calculate size of the input and output non-pointer data
1046 // - Allocate buffers for input and output pointers
1047 for (int i = 0; i < m_vars_total; i++) {
1048 void* alloc_base = NULL;
1049 int64_t alloc_disp = 0;
1050 int64_t alloc_size = 0;
1051 bool src_is_for_mic = (m_vars[i].direction.out ||
1052 m_vars[i].into == NULL);
1054 const char *var_sname = "";
1055 if (vars2 != NULL && i < vars_total) {
1056 if (vars2[i].sname != NULL) {
1057 var_sname = vars2[i].sname;
1060 OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
1061 i, var_sname,
1062 vardesc_direction_as_string[m_vars[i].direction.bits],
1063 vardesc_type_as_string[m_vars[i].type.src]);
1064 if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
1065 OFFLOAD_TRACE(2, " into=%s, %s\n", vars2[i].dname,
1066 vardesc_type_as_string[m_vars[i].type.dst]);
1068 OFFLOAD_TRACE(2,
1069 " type_src=%d, type_dstn=%d, direction=%d, "
1070 "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
1071 "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
1072 m_vars[i].type.src,
1073 m_vars[i].type.dst,
1074 m_vars[i].direction.bits,
1075 m_vars[i].alloc_if,
1076 m_vars[i].free_if,
1077 m_vars[i].align,
1078 m_vars[i].mic_offset,
1079 m_vars[i].flags.bits,
1080 m_vars[i].offset,
1081 m_vars[i].size,
1082 m_vars[i].count,
1083 m_vars[i].ptr,
1084 m_vars[i].into);
1085 // If any varDesc flags bits set, show them
1086 if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
1087 trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
1090 // preallocated implies targetptr
1091 if (m_vars[i].flags.preallocated) {
1092 // targetptr preallocated alloc_if(1) may not be used with
1093 // an in clause
1094 if (m_vars[i].direction.in && m_vars[i].alloc_if) {
1095 LIBOFFLOAD_ERROR(c_in_with_preallocated);
1096 exit(1);
1098 m_vars[i].flags.targetptr = 1;
1100 if (m_vars[i].alloc != NULL) {
1101 // array descriptor
1102 const Arr_Desc *ap =
1103 static_cast<const Arr_Desc*>(m_vars[i].alloc);
1105 // debug dump
1106 ARRAY_DESC_DUMP(" ", "ALLOC", ap, 0, 1);
1108 __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
1110 alloc_base = reinterpret_cast<void*>(ap->base);
1113 m_vars_extra[i].alloc = m_vars[i].alloc;
1114 m_vars_extra[i].cpu_disp = 0;
1115 m_vars_extra[i].cpu_offset = 0;
1116 m_vars_extra[i].src_data = 0;
1117 m_vars_extra[i].read_rng_src = 0;
1118 m_vars_extra[i].read_rng_dst = 0;
1119 m_vars_extra[i].omp_last_event_type = c_last_not;
1120 // flag is_arr_ptr_el is 1 only for var_descs generated
1121 // for c_data_ptr_array type
1122 if (i < vars_total) {
1123 m_vars_extra[i].is_arr_ptr_el = 0;
1126 switch (m_vars[i].type.src) {
1127 case c_data_ptr_array:
1129 const Arr_Desc *ap;
1130 const VarDesc3 *vd3 =
1131 static_cast<const VarDesc3*>(m_vars[i].ptr);
1132 int flags = vd3->array_fields;
1133 OFFLOAD_TRACE(2,
1134 " pointer array flags = %04x\n", flags);
1135 OFFLOAD_TRACE(2,
1136 " pointer array type is %s\n",
1137 vardesc_type_as_string[flags & 0x3f]);
1138 ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
1139 ARRAY_DESC_DUMP(" ", "ptr array", ap,
1140 m_vars[i].flags.is_pointer, 1);
1141 if (m_vars[i].into) {
1142 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
1143 ARRAY_DESC_DUMP(
1144 " ", "into array", ap, 0, 1);
1146 if ((flags & (1<<flag_align_is_array)) != 0) {
1147 ap = static_cast<const Arr_Desc*>(vd3->align_array);
1148 ARRAY_DESC_DUMP(
1149 " ", "align array", ap, 0, 1);
1151 if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
1152 ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
1153 ARRAY_DESC_DUMP(
1154 " ", "alloc_if array", ap, 0, 1);
1156 if ((flags & (1<<flag_free_if_is_array)) != 0) {
1157 ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
1158 ARRAY_DESC_DUMP(
1159 " ", "free_if array", ap, 0, 1);
1161 if ((flags & (1<<flag_extent_start_is_array)) != 0) {
1162 ap = static_cast<const Arr_Desc*>(vd3->extent_start);
1163 ARRAY_DESC_DUMP(
1164 " ", "extent_start array", ap, 0, 1);
1165 } else if ((flags &
1166 (1<<flag_extent_start_is_scalar)) != 0) {
1167 OFFLOAD_TRACE(2,
1168 " extent_start scalar = %d\n",
1169 (int64_t)vd3->extent_start);
1171 if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
1172 ap = static_cast<const Arr_Desc*>
1173 (vd3->extent_elements);
1174 ARRAY_DESC_DUMP(" ",
1175 "extent_elements array", ap, 0, 1);
1176 } else if ((flags &
1177 (1<<flag_extent_elements_is_scalar)) != 0) {
1178 OFFLOAD_TRACE(2,
1179 " extent_elements scalar = %d\n",
1180 (int64_t)vd3->extent_elements);
1182 if ((flags & (1<<flag_into_start_is_array)) != 0) {
1183 ap = static_cast<const Arr_Desc*>(vd3->into_start);
1184 ARRAY_DESC_DUMP(
1185 " ", "into_start array", ap, 0, 1);
1186 } else if ((flags &
1187 (1<<flag_into_start_is_scalar)) != 0) {
1188 OFFLOAD_TRACE(2,
1189 " into_start scalar = %d\n",
1190 (int64_t)vd3->into_start);
1192 if ((flags & (1<<flag_into_elements_is_array)) != 0) {
1193 ap = static_cast<const Arr_Desc*>(vd3->into_elements);
1194 ARRAY_DESC_DUMP(
1195 " ", "into_elements array", ap, 0, 1);
1196 } else if ((flags &
1197 (1<<flag_into_elements_is_scalar)) != 0) {
1198 OFFLOAD_TRACE(2,
1199 " into_elements scalar = %d\n",
1200 (int64_t)vd3->into_elements);
1202 if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
1203 ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
1204 ARRAY_DESC_DUMP(
1205 " ", "alloc_start array", ap, 0, 1);
1206 } else if ((flags &
1207 (1<<flag_alloc_start_is_scalar)) != 0) {
1208 OFFLOAD_TRACE(2,
1209 " alloc_start scalar = %d\n",
1210 (int64_t)vd3->alloc_start);
1212 if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
1213 ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
1214 ARRAY_DESC_DUMP(" ",
1215 "alloc_elements array", ap, 0, 1);
1216 } else if ((flags &
1217 (1<<flag_alloc_elements_is_scalar)) != 0) {
1218 OFFLOAD_TRACE(2,
1219 " alloc_elements scalar = %d\n",
1220 (int64_t)vd3->alloc_elements);
1223 if (!gen_var_descs_for_pointer_array(i)) {
1224 return false;
1226 break;
1228 case c_data:
1229 case c_void_ptr:
1230 case c_cean_var:
1231 // In all uses later
1232 // VarDesc.size will have the length of the data to be
1233 // transferred
1234 // VarDesc.disp will have an offset from base
1235 if (m_vars[i].type.src == c_cean_var) {
1236 // array descriptor
1237 const Arr_Desc *ap =
1238 static_cast<const Arr_Desc*>(m_vars[i].ptr);
1240 // debug dump
1241 ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
1243 // offset and length are derived from the array descriptor
1244 __arr_data_offset_and_length(ap, m_vars[i].disp,
1245 m_vars[i].size);
1246 if (!is_arr_desc_contiguous(ap)) {
1247 m_vars[i].flags.is_noncont_src = 1;
1248 m_vars_extra[i].read_rng_src =
1249 init_read_ranges_arr_desc(ap);
1251 // all necessary information about length and offset is
1252 // transferred in var descriptor. There is no need to send
1253 // array descriptor to the target side.
1254 m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1256 else {
1257 m_vars[i].size *= m_vars[i].count;
1258 m_vars[i].disp = 0;
1261 if (m_vars[i].direction.bits) {
1262 // make sure that transfer size > 0
1263 if (m_vars[i].size <= 0) {
1264 LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
1265 exit(1);
1268 if (m_vars[i].flags.is_static) {
1269 PtrData *ptr_data;
1271 // find data associated with variable
1272 if (!find_ptr_data(ptr_data,
1273 m_vars[i].ptr,
1274 m_vars[i].disp,
1275 m_vars[i].size,
1276 false, false)) {
1277 return false;
1280 if (ptr_data != 0) {
1281 // offset to base from the beginning of the buffer
1282 // memory
1283 m_vars[i].offset =
1284 (char*) m_vars[i].ptr -
1285 (char*) ptr_data->cpu_addr.start();
1287 else {
1288 m_vars[i].flags.is_static = false;
1289 if (m_vars[i].into == NULL) {
1290 m_vars[i].flags.is_static_dstn = false;
1293 m_vars_extra[i].src_data = ptr_data;
1296 if (m_is_openmp) {
1297 if (m_vars[i].flags.is_static) {
1298 // Static data is transferred either by omp target
1299 // update construct which passes zeros for
1300 // alloc_if and free_if or by always modifier.
1301 if (!m_vars[i].flags.always_copy &&
1302 (m_vars[i].alloc_if || m_vars[i].free_if)) {
1303 m_vars[i].direction.bits = c_parameter_nocopy;
1306 else {
1307 AutoData *auto_data;
1308 if (m_vars[i].alloc_if) {
1309 auto_data = m_device.insert_auto_data(
1310 m_vars[i].ptr, m_vars[i].size);
1311 auto_data->add_reference();
1313 else {
1314 // TODO: what should be done if var is not in
1315 // the table?
1316 auto_data = m_device.find_auto_data(
1317 m_vars[i].ptr);
1320 // For automatic variables data is transferred:
1321 // - if always modifier is used OR
1322 // - if alloc_if == 0 && free_if == 0 OR
1323 // - if reference count is 1
1324 if (!m_vars[i].flags.always_copy &&
1325 (m_vars[i].alloc_if || m_vars[i].free_if) &&
1326 auto_data != 0 &&
1327 auto_data->get_reference() != 1) {
1328 m_vars[i].direction.bits = c_parameter_nocopy;
1331 // save data for later use
1332 m_vars_extra[i].auto_data = auto_data;
1336 if (m_vars[i].direction.in &&
1337 !m_vars[i].flags.is_static) {
1338 m_in_datalen += m_vars[i].size;
1340 // for non-static target destination defined as CEAN
1341 // expression we pass to target its size and dist
1342 if (m_vars[i].into == NULL &&
1343 m_vars[i].type.src == c_cean_var) {
1344 m_in_datalen += 2 * sizeof(uint64_t);
1346 m_need_runfunction = true;
1348 if (m_vars[i].direction.out &&
1349 !m_vars[i].flags.is_static) {
1350 m_out_datalen += m_vars[i].size;
1351 m_need_runfunction = true;
1354 break;
1356 case c_dv:
1357 if (m_vars[i].direction.bits ||
1358 m_vars[i].alloc_if ||
1359 m_vars[i].free_if) {
1360 ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
1362 // debug dump
1363 __dv_desc_dump("IN/OUT", dvp);
1365 // send dope vector contents excluding base
1366 m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1367 m_need_runfunction = true;
1369 break;
1371 case c_string_ptr:
1372 if ((m_vars[i].direction.bits ||
1373 m_vars[i].alloc_if ||
1374 m_vars[i].free_if) &&
1375 m_vars[i].size == 0) {
1376 m_vars[i].size = 1;
1377 m_vars[i].count =
1378 strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
1380 /* fallthru */
1382 case c_data_ptr:
1383 if (m_vars[i].flags.is_stack_buf &&
1384 !m_vars[i].direction.bits &&
1385 m_vars[i].alloc_if) {
1386 // this var_desc is for stack buffer
1387 bool is_new;
1389 if (!offload_stack_memory_manager(
1390 stack_addr, entry_id,
1391 m_vars[i].count, m_vars[i].align, &is_new)) {
1392 return false;
1394 if (is_new) {
1395 m_compute_buffers.push_back(
1396 m_stack_ptr_data->mic_buf);
1397 m_device.m_persist_list.front().cpu_stack_addr =
1398 static_cast<char*>(m_vars[i].ptr);
1400 else {
1401 m_vars[i].flags.sink_addr = 1;
1402 m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
1404 m_vars[i].size = m_destroy_stack.size();
1405 m_vars_extra[i].src_data = m_stack_ptr_data;
1407 // need to add or remove references for stack buffer at target
1408 if (is_new || m_destroy_stack.size()) {
1409 m_need_runfunction = true;
1412 break;
1414 /* fallthru */
1416 case c_cean_var_ptr:
1417 case c_dv_ptr:
1418 if (m_vars[i].type.src == c_cean_var_ptr) {
1419 // array descriptor
1420 const Arr_Desc *ap =
1421 static_cast<const Arr_Desc*>(m_vars[i].ptr);
1423 // debug dump
1424 ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
1426 // offset and length are derived from the array descriptor
1427 __arr_data_offset_and_length(ap, m_vars[i].disp,
1428 m_vars[i].size);
1430 if (!is_arr_desc_contiguous(ap)) {
1431 m_vars[i].flags.is_noncont_src = 1;
1432 m_vars_extra[i].read_rng_src =
1433 init_read_ranges_arr_desc(ap);
1435 // all necessary information about length and offset is
1436 // transferred in var descriptor. There is no need to send
1437 // array descriptor to the target side.
1438 m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1440 else if (m_vars[i].type.src == c_dv_ptr) {
1441 // need to send DV to the device unless it is 'nocopy'
1442 if (m_vars[i].direction.bits ||
1443 m_vars[i].alloc_if ||
1444 m_vars[i].free_if) {
1445 ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1447 // debug dump
1448 __dv_desc_dump("IN/OUT", dvp);
1450 m_vars[i].direction.bits = c_parameter_in;
1453 // no displacement
1454 m_vars[i].disp = 0;
1456 else {
1457 // c_data_ptr or c_string_ptr
1458 m_vars[i].size *= m_vars[i].count;
1459 m_vars[i].disp = 0;
1462 if (m_vars[i].direction.bits ||
1463 m_vars[i].alloc_if ||
1464 m_vars[i].free_if) {
1465 PtrData *ptr_data;
1467 // check that buffer length > 0
1468 if (m_vars[i].alloc_if &&
1469 m_vars[i].disp + m_vars[i].size <
1470 (m_is_openmp ? 0 : 1)) {
1471 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1472 exit(1);
1475 // base address
1476 void *base = *static_cast<void**>(m_vars[i].ptr);
1478 // allocate buffer if we have no INTO and don't need
1479 // allocation for the ptr at target
1480 if (src_is_for_mic) {
1481 if (m_vars[i].flags.is_stack_buf) {
1482 // for stack persistent objects ptr data is created
1483 // by var_desc with number 0.
1484 // Its ptr_data is stored at m_stack_ptr_data
1485 ptr_data = m_stack_ptr_data;
1486 m_vars[i].flags.sink_addr = 1;
1488 else if (m_vars[i].alloc_if) {
1489 if (m_vars[i].flags.preallocated) {
1490 m_out_datalen += sizeof(void*);
1491 m_need_runfunction = true;
1492 break;
1494 // add new entry
1495 if (!alloc_ptr_data(
1496 ptr_data,
1497 reinterpret_cast<char *>(base) + alloc_disp,
1498 (alloc_base != NULL) ?
1499 alloc_disp : m_vars[i].disp,
1500 (alloc_base != NULL) ?
1501 alloc_size : m_vars[i].size,
1502 alloc_disp,
1503 (alloc_base != NULL) ?
1504 0 : m_vars[i].align,
1505 m_vars[i].flags.targetptr,
1507 m_vars[i].flags.pin)) {
1508 return false;
1510 if (m_vars[i].flags.targetptr) {
1511 if (!init_mic_address(ptr_data)) {
1512 return false;
1514 *static_cast<void**>(m_vars[i].ptr) = base =
1515 reinterpret_cast<void*>(ptr_data->mic_addr);
1517 if (ptr_data->add_reference() == 0 &&
1518 ptr_data->mic_buf != 0) {
1519 // add buffer to the list of buffers that
1520 // are passed to dispatch call
1521 m_compute_buffers.push_back(
1522 ptr_data->mic_buf);
1524 else if (!m_vars[i].flags.pin &&
1525 !m_vars[i].flags.preallocated) {
1526 // will send buffer address to device
1527 m_vars[i].flags.sink_addr = 1;
1530 if (!m_vars[i].flags.pin &&
1531 !ptr_data->is_static) {
1532 // need to add reference for buffer
1533 m_need_runfunction = true;
1536 else {
1537 bool error_if_not_found = true;
1538 if (m_is_openmp) {
1539 // For omp target update variable is ignored
1540 // if it does not exist.
1541 if (m_vars[i].flags.always_copy ||
1542 (!m_vars[i].alloc_if &&
1543 !m_vars[i].free_if)) {
1544 error_if_not_found = false;
1548 // use existing association from pointer table
1549 if (!find_ptr_data(ptr_data,
1550 base,
1551 m_vars[i].disp,
1552 m_vars[i].size,
1553 m_vars[i].flags.targetptr,
1554 error_if_not_found)) {
1555 return false;
1558 if (m_is_openmp) {
1559 // make var nocopy if it does not exist
1560 if (ptr_data == 0) {
1561 m_vars[i].direction.bits =
1562 c_parameter_nocopy;
1566 if (ptr_data != 0) {
1567 m_vars[i].flags.sink_addr = 1;
1571 if (ptr_data != 0) {
1572 if (m_is_openmp) {
1573 // data is transferred only if
1574 // alloc_if == 0 && free_if == 0
1575 // or reference count is 1
1576 if (!m_vars[i].flags.always_copy &&
1577 ((m_vars[i].alloc_if ||
1578 m_vars[i].free_if) &&
1579 ptr_data->get_reference() != 1)) {
1580 m_vars[i].direction.bits =
1581 c_parameter_nocopy;
1585 if (ptr_data->alloc_disp != 0) {
1586 m_vars[i].flags.alloc_disp = 1;
1587 m_in_datalen += sizeof(alloc_disp);
1590 if (m_vars[i].flags.sink_addr) {
1591 // get buffers's address on the sink
1592 if (!init_mic_address(ptr_data)) {
1593 return false;
1596 m_in_datalen += sizeof(ptr_data->mic_addr);
1599 if (!m_vars[i].flags.pin &&
1600 !ptr_data->is_static && m_vars[i].free_if) {
1601 // need to decrement buffer reference on target
1602 m_need_runfunction = true;
1605 // offset to base from the beginning of the buffer
1606 // memory
1607 m_vars[i].offset = (char*) base -
1608 (char*) ptr_data->cpu_addr.start();
1610 // copy other pointer properties to var descriptor
1611 m_vars[i].mic_offset = ptr_data->mic_offset;
1612 m_vars[i].flags.is_static = ptr_data->is_static;
1615 else {
1616 if (!find_ptr_data(ptr_data,
1617 base,
1618 m_vars[i].disp,
1619 m_vars[i].size,
1620 false, false)) {
1621 return false;
1623 if (ptr_data) {
1624 m_vars[i].offset =
1625 (char*) base -
1626 (char*) ptr_data->cpu_addr.start();
1630 // save pointer data
1631 m_vars_extra[i].src_data = ptr_data;
1633 break;
1635 case c_func_ptr:
1636 if (m_vars[i].direction.in) {
1637 m_in_datalen += __offload_funcs.max_name_length();
1639 if (m_vars[i].direction.out) {
1640 m_out_datalen += __offload_funcs.max_name_length();
1642 m_need_runfunction = true;
1643 break;
1645 case c_dv_data:
1646 case c_dv_ptr_data:
1647 case c_dv_data_slice:
1648 case c_dv_ptr_data_slice:
1649 ArrDesc *dvp;
1650 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1651 const Arr_Desc *ap;
1652 ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
1654 dvp = (m_vars[i].type.src == c_dv_data_slice) ?
1655 reinterpret_cast<ArrDesc*>(ap->base) :
1656 *reinterpret_cast<ArrDesc**>(ap->base);
1658 else {
1659 dvp = (m_vars[i].type.src == c_dv_data) ?
1660 static_cast<ArrDesc*>(m_vars[i].ptr) :
1661 *static_cast<ArrDesc**>(m_vars[i].ptr);
1664 // if allocatable dope vector isn't allocated don't
1665 // transfer its data
1666 if (!__dv_is_allocated(dvp)) {
1667 m_vars[i].direction.bits = c_parameter_nocopy;
1668 m_vars[i].alloc_if = 0;
1669 m_vars[i].free_if = 0;
1671 if (m_vars[i].direction.bits ||
1672 m_vars[i].alloc_if ||
1673 m_vars[i].free_if) {
1674 const Arr_Desc *ap;
1676 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1677 ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
1679 // debug dump
1680 ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
1682 if (!__dv_is_contiguous(dvp)) {
1683 m_vars[i].flags.is_noncont_src = 1;
1684 m_vars_extra[i].read_rng_src =
1685 init_read_ranges_dv(dvp);
1688 // size and displacement
1689 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src)) {
1690 // offset and length are derived from the
1691 // array descriptor
1692 __arr_data_offset_and_length(ap,
1693 m_vars[i].disp,
1694 m_vars[i].size);
1695 if (m_vars[i].direction.bits) {
1696 if (!is_arr_desc_contiguous(ap)) {
1697 if (m_vars[i].flags.is_noncont_src) {
1698 LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
1699 return false;
1701 m_vars[i].flags.is_noncont_src = 1;
1702 m_vars_extra[i].read_rng_src =
1703 init_read_ranges_arr_desc(ap);
1707 else {
1708 if (m_vars[i].flags.has_length) {
1709 m_vars[i].size =
1710 __dv_data_length(dvp, m_vars[i].count);
1712 else {
1713 m_vars[i].size = __dv_data_length(dvp);
1715 m_vars[i].disp = 0;
1718 // check that length >= 0
1719 if (m_vars[i].alloc_if &&
1720 (m_vars[i].disp + m_vars[i].size < 0)) {
1721 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1722 exit(1);
1725 // base address
1726 void *base = reinterpret_cast<void*>(dvp->Base);
1727 PtrData *ptr_data;
1729 // allocate buffer if we have no INTO and don't need
1730 // allocation for the ptr at target
1731 if (src_is_for_mic) {
1732 if (m_vars[i].alloc_if) {
1733 // add new entry
1734 if (!alloc_ptr_data(
1735 ptr_data,
1736 reinterpret_cast<char *>(base) + alloc_disp,
1737 (alloc_base != NULL) ?
1738 alloc_disp : m_vars[i].disp,
1739 (alloc_base != NULL) ?
1740 alloc_size : m_vars[i].size,
1741 alloc_disp,
1742 (alloc_base != NULL) ?
1743 0 : m_vars[i].align,
1744 m_vars[i].flags.targetptr,
1745 m_vars[i].flags.preallocated,
1746 m_vars[i].flags.pin)) {
1747 return false;
1750 if (ptr_data->add_reference() == 0 &&
1751 ptr_data->mic_buf != 0) {
1752 // add buffer to the list of buffers
1753 // that are passed to dispatch call
1754 m_compute_buffers.push_back(
1755 ptr_data->mic_buf);
1757 else {
1758 // will send buffer address to device
1759 m_vars[i].flags.sink_addr = 1;
1762 if (!ptr_data->is_static) {
1763 // need to add reference for buffer
1764 m_need_runfunction = true;
1767 else {
1768 bool error_if_not_found = true;
1769 if (m_is_openmp) {
1770 // For omp target update variable is ignored
1771 // if it does not exist.
1772 if (m_vars[i].flags.always_copy ||
1773 (!m_vars[i].alloc_if &&
1774 !m_vars[i].free_if)) {
1775 error_if_not_found = false;
1779 // use existing association from pointer table
1780 if (!find_ptr_data(ptr_data,
1781 base,
1782 m_vars[i].disp,
1783 m_vars[i].size,
1784 m_vars[i].flags.targetptr,
1785 error_if_not_found)) {
1786 return false;
1789 if (m_is_openmp) {
1790 // make var nocopy if it does not exist
1791 if (ptr_data == 0) {
1792 m_vars[i].direction.bits =
1793 c_parameter_nocopy;
1797 if (ptr_data != 0) {
1798 // need to update base in dope vector on device
1799 m_vars[i].flags.sink_addr = 1;
1803 if (ptr_data != 0) {
1804 if (m_is_openmp) {
1805 // data is transferred if
1806 // - if always modifier is used OR
1807 // - if alloc_if == 0 && free_if == 0 OR
1808 // - if reference count is 1
1809 if (!m_vars[i].flags.always_copy &&
1810 (m_vars[i].alloc_if ||
1811 m_vars[i].free_if) &&
1812 ptr_data->get_reference() != 1) {
1813 m_vars[i].direction.bits =
1814 c_parameter_nocopy;
1818 if (ptr_data->alloc_disp != 0) {
1819 m_vars[i].flags.alloc_disp = 1;
1820 m_in_datalen += sizeof(alloc_disp);
1823 if (m_vars[i].flags.sink_addr) {
1824 // get buffers's address on the sink
1825 if (!init_mic_address(ptr_data)) {
1826 return false;
1829 m_in_datalen += sizeof(ptr_data->mic_addr);
1832 if (!ptr_data->is_static && m_vars[i].free_if) {
1833 // need to decrement buffer reference on target
1834 m_need_runfunction = true;
1837 // offset to base from the beginning of the buffer
1838 // memory
1839 m_vars[i].offset =
1840 (char*) base -
1841 (char*) ptr_data->cpu_addr.start();
1843 // copy other pointer properties to var descriptor
1844 m_vars[i].mic_offset = ptr_data->mic_offset;
1845 m_vars[i].flags.is_static = ptr_data->is_static;
1848 else { // !src_is_for_mic
1849 if (!find_ptr_data(ptr_data,
1850 base,
1851 m_vars[i].disp,
1852 m_vars[i].size,
1853 false, false)) {
1854 return false;
1856 m_vars[i].offset = !ptr_data ? 0 :
1857 (char*) base -
1858 (char*) ptr_data->cpu_addr.start();
1861 // save pointer data
1862 m_vars_extra[i].src_data = ptr_data;
1864 break;
1866 default:
1867 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
1868 LIBOFFLOAD_ABORT;
1870 if (m_vars[i].type.src == c_data_ptr_array) {
1871 continue;
1874 if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
1875 m_vars[i].offset = static_cast<char*>(m_vars[i].ptr) -
1876 m_device.m_persist_list.front().cpu_stack_addr;
1878 // if source is used at CPU save its offset and disp
1879 if (m_vars[i].into == NULL || m_vars[i].direction.in) {
1880 m_vars_extra[i].cpu_offset = m_vars[i].offset;
1881 m_vars_extra[i].cpu_disp = m_vars[i].disp;
1884 // If "into" is define we need to do the similar work for it
1885 if (!m_vars[i].into) {
1886 continue;
1889 int64_t into_disp =0, into_offset = 0;
1891 switch (m_vars[i].type.dst) {
1892 case c_data_ptr_array:
1893 break;
1894 case c_data:
1895 case c_void_ptr:
1896 case c_cean_var: {
1897 int64_t size = m_vars[i].size;
1899 if (m_vars[i].type.dst == c_cean_var) {
1900 // array descriptor
1901 const Arr_Desc *ap =
1902 static_cast<const Arr_Desc*>(m_vars[i].into);
1904 // debug dump
1905 ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
1907 // offset and length are derived from the array descriptor
1908 __arr_data_offset_and_length(ap, into_disp, size);
1910 if (!is_arr_desc_contiguous(ap)) {
1911 m_vars[i].flags.is_noncont_dst = 1;
1912 m_vars_extra[i].read_rng_dst =
1913 init_read_ranges_arr_desc(ap);
1914 if (!cean_ranges_match(
1915 m_vars_extra[i].read_rng_src,
1916 m_vars_extra[i].read_rng_dst)) {
1917 LIBOFFLOAD_ERROR(c_ranges_dont_match);
1918 exit(1);
1921 m_vars[i].into = reinterpret_cast<void*>(ap->base);
1924 int64_t size_src = m_vars_extra[i].read_rng_src ?
1925 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
1926 m_vars[i].size;
1927 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
1928 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
1929 size;
1930 // It's supposed that "into" size must be not less
1931 // than src size
1932 if (size_src > size_dst) {
1933 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
1934 size_src, size_dst);
1935 exit(1);
1938 if (m_vars[i].direction.bits) {
1939 if (m_vars[i].flags.is_static_dstn) {
1940 PtrData *ptr_data;
1942 // find data associated with variable
1943 if (!find_ptr_data(ptr_data, m_vars[i].into,
1944 into_disp, size, false, false)) {
1945 return false;
1947 if (ptr_data != 0) {
1948 // offset to base from the beginning of the buffer
1949 // memory
1950 into_offset =
1951 (char*) m_vars[i].into -
1952 (char*) ptr_data->cpu_addr.start();
1954 else {
1955 m_vars[i].flags.is_static_dstn = false;
1957 m_vars_extra[i].dst_data = ptr_data;
1961 if (m_vars[i].direction.in &&
1962 !m_vars[i].flags.is_static_dstn) {
1963 m_in_datalen += m_vars[i].size;
1965 // for non-static target destination defined as CEAN
1966 // expression we pass to target its size and dist
1967 if (m_vars[i].type.dst == c_cean_var) {
1968 m_in_datalen += 2 * sizeof(uint64_t);
1970 m_need_runfunction = true;
1972 break;
1975 case c_dv:
1976 if (m_vars[i].direction.bits ||
1977 m_vars[i].alloc_if ||
1978 m_vars[i].free_if) {
1979 ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
1981 // debug dump
1982 __dv_desc_dump("INTO", dvp);
1984 // send dope vector contents excluding base
1985 m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1986 m_need_runfunction = true;
1988 break;
1990 case c_string_ptr:
1991 case c_data_ptr:
1992 case c_cean_var_ptr:
1993 case c_dv_ptr: {
1994 int64_t size = m_vars[i].size;
1996 if (m_vars[i].type.dst == c_cean_var_ptr) {
1997 // array descriptor
1998 const Arr_Desc *ap =
1999 static_cast<const Arr_Desc*>(m_vars[i].into);
2001 // debug dump
2002 ARRAY_DESC_DUMP(" ", "INTO", ap, 1, src_is_for_mic);
2004 // offset and length are derived from the array descriptor
2005 __arr_data_offset_and_length(ap, into_disp, size);
2007 if (!is_arr_desc_contiguous(ap)) {
2008 m_vars[i].flags.is_noncont_src = 1;
2009 m_vars_extra[i].read_rng_dst =
2010 init_read_ranges_arr_desc(ap);
2011 if (!cean_ranges_match(
2012 m_vars_extra[i].read_rng_src,
2013 m_vars_extra[i].read_rng_dst)) {
2014 LIBOFFLOAD_ERROR(c_ranges_dont_match);
2017 m_vars[i].into = reinterpret_cast<char**>(ap->base);
2019 else if (m_vars[i].type.dst == c_dv_ptr) {
2020 // need to send DV to the device unless it is 'nocopy'
2021 if (m_vars[i].direction.bits ||
2022 m_vars[i].alloc_if ||
2023 m_vars[i].free_if) {
2024 ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
2026 // debug dump
2027 __dv_desc_dump("INTO", dvp);
2029 m_vars[i].direction.bits = c_parameter_in;
2033 int64_t size_src = m_vars_extra[i].read_rng_src ?
2034 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2035 m_vars[i].size;
2036 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
2037 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2038 size;
2039 // It's supposed that "into" size must be not less than
2040 // src size
2041 if (size_src > size_dst) {
2042 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2043 size_src, size_dst);
2044 exit(1);
2047 if (m_vars[i].direction.bits) {
2048 PtrData *ptr_data;
2050 // base address
2051 void *base = *static_cast<void**>(m_vars[i].into);
2053 if (m_vars[i].direction.in) {
2054 // allocate buffer
2055 if (m_vars[i].flags.is_stack_buf) {
2056 // for stack persistent objects ptr data is created
2057 // by var_desc with number 0.
2058 // Its ptr_data is stored at m_stack_ptr_data
2059 ptr_data = m_stack_ptr_data;
2060 m_vars[i].flags.sink_addr = 1;
2062 else if (m_vars[i].alloc_if) {
2063 if (m_vars[i].flags.preallocated) {
2064 m_out_datalen += sizeof(void*);
2065 m_need_runfunction = true;
2066 break;
2068 // add new entry
2069 if (!alloc_ptr_data(
2070 ptr_data,
2071 reinterpret_cast<char *>(base) + alloc_disp,
2072 (alloc_base != NULL) ?
2073 alloc_disp : into_disp,
2074 (alloc_base != NULL) ?
2075 alloc_size : size,
2076 alloc_disp,
2077 (alloc_base != NULL) ?
2078 0 : m_vars[i].align,
2079 m_vars[i].flags.targetptr,
2080 m_vars[i].flags.preallocated,
2081 m_vars[i].flags.pin)) {
2082 return false;
2084 if (m_vars[i].flags.targetptr) {
2085 if (!init_mic_address(ptr_data)) {
2086 return false;
2088 *static_cast<void**>(m_vars[i].into) = base =
2089 reinterpret_cast<void*>(ptr_data->mic_addr);
2091 if (ptr_data->add_reference() == 0 &&
2092 ptr_data->mic_buf != 0) {
2093 // add buffer to the list of buffers that
2094 // are passed to dispatch call
2095 m_compute_buffers.push_back(
2096 ptr_data->mic_buf);
2098 else {
2099 // will send buffer address to device
2100 m_vars[i].flags.sink_addr = 1;
2103 if (!ptr_data->is_static) {
2104 // need to add reference for buffer
2105 m_need_runfunction = true;
2108 else {
2109 // use existing association from pointer table
2110 if (!find_ptr_data(ptr_data, base, into_disp,
2111 size, m_vars[i].flags.targetptr, true)) {
2112 return false;
2114 m_vars[i].flags.sink_addr = 1;
2117 if (ptr_data->alloc_disp != 0) {
2118 m_vars[i].flags.alloc_disp = 1;
2119 m_in_datalen += sizeof(alloc_disp);
2122 if (m_vars[i].flags.sink_addr) {
2123 // get buffers's address on the sink
2124 if (!init_mic_address(ptr_data)) {
2125 return false;
2128 m_in_datalen += sizeof(ptr_data->mic_addr);
2131 if (!ptr_data->is_static && m_vars[i].free_if) {
2132 // need to decrement buffer reference on target
2133 m_need_runfunction = true;
2136 // copy other pointer properties to var descriptor
2137 m_vars[i].mic_offset = ptr_data->mic_offset;
2138 m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2140 else {
2141 if (!find_ptr_data(ptr_data,
2142 base,
2143 into_disp,
2144 m_vars[i].size,
2145 false, false)) {
2146 return false;
2149 if (ptr_data) {
2150 into_offset = ptr_data ?
2151 (char*) base -
2152 (char*) ptr_data->cpu_addr.start() :
2155 // save pointer data
2156 m_vars_extra[i].dst_data = ptr_data;
2158 break;
2161 case c_func_ptr:
2162 break;
2164 case c_dv_data:
2165 case c_dv_ptr_data:
2166 case c_dv_data_slice:
2167 case c_dv_ptr_data_slice:
2168 if (m_vars[i].direction.bits ||
2169 m_vars[i].alloc_if ||
2170 m_vars[i].free_if) {
2171 const Arr_Desc *ap;
2172 ArrDesc *dvp;
2173 PtrData *ptr_data;
2174 int64_t disp;
2175 int64_t size;
2177 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
2178 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
2180 // debug dump
2181 ARRAY_DESC_DUMP(" ", "INTO", ap, 0, src_is_for_mic);
2183 dvp = (m_vars[i].type.dst == c_dv_data_slice) ?
2184 reinterpret_cast<ArrDesc*>(ap->base) :
2185 *reinterpret_cast<ArrDesc**>(ap->base);
2187 else {
2188 dvp = (m_vars[i].type.dst == c_dv_data) ?
2189 static_cast<ArrDesc*>(m_vars[i].into) :
2190 *static_cast<ArrDesc**>(m_vars[i].into);
2192 if (!__dv_is_contiguous(dvp)) {
2193 m_vars[i].flags.is_noncont_dst = 1;
2194 m_vars_extra[i].read_rng_dst =
2195 init_read_ranges_dv(dvp);
2197 // size and displacement
2198 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
2199 // offset and length are derived from the array
2200 // descriptor
2201 __arr_data_offset_and_length(ap, into_disp, size);
2202 if (m_vars[i].direction.bits) {
2203 if (!is_arr_desc_contiguous(ap)) {
2204 if (m_vars[i].flags.is_noncont_dst) {
2205 LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
2206 return false;
2208 m_vars[i].flags.is_noncont_dst = 1;
2209 m_vars_extra[i].read_rng_dst =
2210 init_read_ranges_arr_desc(ap);
2211 if (!cean_ranges_match(
2212 m_vars_extra[i].read_rng_src,
2213 m_vars_extra[i].read_rng_dst)) {
2214 LIBOFFLOAD_ERROR(c_ranges_dont_match);
2219 else {
2220 if (m_vars[i].flags.has_length) {
2221 size = __dv_data_length(dvp, m_vars[i].count);
2223 else {
2224 size = __dv_data_length(dvp);
2226 disp = 0;
2229 int64_t size_src =
2230 m_vars_extra[i].read_rng_src ?
2231 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2232 m_vars[i].size;
2233 int64_t size_dst =
2234 m_vars_extra[i].read_rng_dst ?
2235 cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2236 size;
2237 // It's supposed that "into" size must be not less
2238 // than src size
2239 if (size_src > size_dst) {
2240 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2241 size_src, size_dst);
2242 exit(1);
2245 // base address
2246 void *base = reinterpret_cast<void*>(dvp->Base);
2248 // allocate buffer
2249 if (m_vars[i].direction.in) {
2250 if (m_vars[i].alloc_if) {
2251 // add new entry
2252 if (!alloc_ptr_data(
2253 ptr_data,
2254 reinterpret_cast<char *>(base) + alloc_disp,
2255 (alloc_base != NULL) ?
2256 alloc_disp : into_disp,
2257 (alloc_base != NULL) ?
2258 alloc_size : size,
2259 alloc_disp,
2260 (alloc_base != NULL) ?
2261 0 : m_vars[i].align,
2262 m_vars[i].flags.targetptr,
2263 m_vars[i].flags.preallocated,
2264 m_vars[i].flags.pin)) {
2265 return false;
2267 if (ptr_data->add_reference() == 0 &&
2268 ptr_data->mic_buf !=0) {
2269 // add buffer to the list of buffers
2270 // that are passed to dispatch call
2271 m_compute_buffers.push_back(
2272 ptr_data->mic_buf);
2274 else {
2275 // will send buffer address to device
2276 m_vars[i].flags.sink_addr = 1;
2279 if (!ptr_data->is_static) {
2280 // need to add reference for buffer
2281 m_need_runfunction = true;
2284 else {
2285 // use existing association from pointer table
2286 if (!find_ptr_data(ptr_data, base, into_disp,
2287 size, m_vars[i].flags.targetptr, true)) {
2288 return false;
2291 // need to update base in dope vector on device
2292 m_vars[i].flags.sink_addr = 1;
2295 if (ptr_data->alloc_disp != 0) {
2296 m_vars[i].flags.alloc_disp = 1;
2297 m_in_datalen += sizeof(alloc_disp);
2300 if (m_vars[i].flags.sink_addr) {
2301 // get buffers's address on the sink
2302 if (!init_mic_address(ptr_data)) {
2303 return false;
2305 m_in_datalen += sizeof(ptr_data->mic_addr);
2308 if (!ptr_data->is_static && m_vars[i].free_if) {
2309 // need to decrement buffer reference on target
2310 m_need_runfunction = true;
2313 // offset to base from the beginning of the buffer
2314 // memory
2315 into_offset =
2316 (char*) base - (char*) ptr_data->cpu_addr.start();
2318 // copy other pointer properties to var descriptor
2319 m_vars[i].mic_offset = ptr_data->mic_offset;
2320 m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2322 else { // src_is_for_mic
2323 if (!find_ptr_data(ptr_data,
2324 base,
2325 into_disp,
2326 size,
2327 false, false)) {
2328 return false;
2330 into_offset = !ptr_data ?
2332 (char*) base - (char*) ptr_data->cpu_addr.start();
2335 // save pointer data
2336 m_vars_extra[i].dst_data = ptr_data;
2338 break;
2340 default:
2341 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars[i].type.src);
2342 LIBOFFLOAD_ABORT;
2344 // if into is used at CPU save its offset and disp
2345 if (m_vars[i].direction.out) {
2346 m_vars_extra[i].cpu_offset = into_offset;
2347 m_vars_extra[i].cpu_disp = into_disp;
2349 else {
2350 if (m_vars[i].flags.is_stack_buf) {
2351 into_offset = static_cast<char*>(m_vars[i].into) -
2352 m_device.m_persist_list.front().cpu_stack_addr;
2354 m_vars[i].offset = into_offset;
2355 m_vars[i].disp = into_disp;
2359 return true;
// setup_misc_data: compute the layout of the "misc data" blob handed to the
// sink-side run function: <FunctionDescriptor><entry name><in/out data?>.
// Small copyin/copyout payloads ride inside the COI pipeline's misc/return
// data; larger ones get a dedicated COI buffer (m_inout_buf).
// Returns false (m_status->result set) on COI buffer-creation failure when a
// status object exists; otherwise aborts via report_coi_error.
// NOTE(review): listing artifacts — numeric line prefixes and missing
// brace-only lines come from the extraction, not the real file.
2362 bool OffloadDescriptor::setup_misc_data(const char *name)
2364     OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
2366     // we can skip run function call together with wait if offloaded
2367     // region is empty and there is no user defined non-pointer IN/OUT data
2368     if (m_need_runfunction) {
2369         // variable descriptors are sent as input data
2370         m_in_datalen += m_vars_total * sizeof(VarDesc);
2372         // timer data is sent as a part of the output data
2373         m_out_datalen += OFFLOAD_TIMER_DATALEN();
2375         // max from input data and output data length
2376         uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
2377                                                            m_out_datalen;
2379         // Misc data has the following layout
2380         //     <Function Descriptor>
2381         //     <Function Name>
2382         //     <In/Out Data>            (optional)
2384         // We can transfer copyin/copyout data in misc/return data which can
2385         // be passed to run function call if its size does not exceed
2386         // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
2387         // buffer for it.
// Descriptor size is rounded up to an 8-byte boundary so the payload that
// follows it stays aligned.
2389         m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
2390         m_func_desc_size = (m_func_desc_size + 7) & ~7;
2392         int misc_data_offset = 0;
2393         int misc_data_size = 0;
2394         if (data_len > 0) {
2395             if (m_func_desc_size +
2396                 m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
2397                 m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
2398                 // use misc/return data for copyin/copyout
2399                 misc_data_offset = m_func_desc_size;
2400                 misc_data_size = data_len;
2402             else {
2403                 OffloadTimer timer_buf(get_timer_data(),
2404                                        c_offload_host_alloc_data_buffer);
2406                 // send/receive data using buffer
2407                 COIRESULT res = COI::BufferCreate(data_len,
2408                                                   COI_BUFFER_NORMAL,
2409                                                   0, 0,
2410                                                   1, &m_device.get_process(),
2411                                                   &m_inout_buf);
2412                 if (res != COI_SUCCESS) {
2413                     if (m_status != 0) {
2414                         m_status->result = translate_coi_error(res);
2415                         return false;
2417                     report_coi_error(c_buf_create, res);
// Buffer is both used by the compute run-function and destroyed in
// offload_finish (registered in both lists).
2420                 m_compute_buffers.push_back(m_inout_buf);
2421                 m_destroy_buffers.push_back(m_inout_buf);
2425         // initialize function descriptor
2426         m_func_desc = (FunctionDescriptor*) calloc(1, m_func_desc_size
2427                                                       + misc_data_size);
2428         if (m_func_desc == NULL)
2429             LIBOFFLOAD_ERROR(c_malloc);
2430         m_func_desc->console_enabled = console_enabled;
2431         m_func_desc->timer_enabled = offload_report_enabled &&
2432                                  (timer_enabled || offload_report_level);
2433         m_func_desc->offload_report_level = offload_report_enabled ?
2434                                                 offload_report_level : 0;
2435         m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
2436         m_func_desc->in_datalen = m_in_datalen;
2437         m_func_desc->out_datalen = m_out_datalen;
2438         m_func_desc->vars_num = m_vars_total;
2439         m_func_desc->data_offset = misc_data_offset;
2441         // append entry name
2442         strcpy(m_func_desc->data, name);
2445     return true;
// setup_omp_async_info: for OpenMP async offloads, determine which operation
// is the *last* one of this offload (run function, last buffer write, or a
// buffer read), so a completion callback can be attached to exactly that
// event. Scans the variable descriptors backwards; the first OUT variable
// found makes the last operation a read, otherwise the last IN variable (if
// no run function is needed) makes it a write.
2448 void OffloadDescriptor::setup_omp_async_info()
2450     OFFLOAD_TRACE(2, "setup_omp_async_info\n");
// Default: if we will call the run function, that is the last event;
// otherwise assume the last buffer write is (refined below).
2451     OmpAsyncLastEventType event_type = m_need_runfunction ?
2452                                        c_last_runfunc : c_last_write;
2453     int last_in = m_need_runfunction ? 0 : -1;
2454     int i;
2456     for (i = m_vars_total - 1; i >=0; i--) {
2457         switch (m_vars[i].type.dst) {
2458             case c_data:
2459             case c_void_ptr:
2460             case c_cean_var:
// Non-pointer data only produces transfers when the destination is static.
2461                 if (m_vars[i].direction.out &&
2462                     m_vars[i].flags.is_static_dstn) {
2463                     event_type = c_last_read;
2465                 else if (last_in < 0 && m_vars[i].direction.in &&
2466                     m_vars[i].flags.is_static_dstn) {
2467                     last_in = i;
2469                 break;
2470             case c_string_ptr:
2471             case c_data_ptr:
2472             case c_cean_var_ptr:
2473             case c_dv_ptr:
2474             case c_dv_data:
2475             case c_dv_ptr_data:
2476             case c_dv_data_slice:
2477             case c_dv_ptr_data_slice:
2479                 if (m_vars[i].direction.out) {
2480                     event_type = c_last_read;
2482                 else if (last_in < 0 && m_vars[i].direction.in) {
2483                     last_in = i;
2485                 break;
2486             default:
2487                 break;
// A read can only be the last event, so the backward scan may stop at the
// first OUT variable it finds.
2489         if (event_type == c_last_read) {
2490             break;
// Mark the variable whose transfer completion ends the offload.
2494     if (event_type == c_last_read) {
2495         m_vars_extra[i].omp_last_event_type = c_last_read;
2497     else if (event_type == c_last_write) {
2498         m_vars_extra[last_in].omp_last_event_type = c_last_write;
2500     m_omp_async_last_event_type = event_type;
2501     OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
2502                   m_omp_async_last_event_type);
2505 extern "C" {
2506 void offload_proxy_task_completed_ooo(
2507 COIEVENT e,
2508 const COIRESULT r,
2509 const void *info
2512 task_completion_callback ((void *) info);
2516 void OffloadDescriptor::register_omp_event_call_back(
2517 const COIEVENT *event,
2518 const void *info)
2520 OFFLOAD_TRACE(2, "register_omp_event_call_back(event=%p, info=%p)\n",
2521 event, info);
2522 if (COI::EventRegisterCallback) {
2523 COI::EventRegisterCallback(
2524 *event,
2525 &offload_proxy_task_completed_ooo,
2526 info, 0);
2527 OFFLOAD_TRACE(2,
2528 "COI::EventRegisterCallback found; callback registered\n");
// wait_dependencies: block until the offloads this one depends on complete.
// Three modes, selected by num_waits:
//   num_waits == 0  : nothing to wait for, trivially true;
//   num_waits == -1 : wait for a stream — a specific one (handle != 0) or
//                     all streams of this device / all devices (handle == 0);
//   num_waits  > 0  : wait for the listed signals.
// Finished tasks are cleaned up and deleted here; returns false if any
// awaited offload reported failure.
2532 bool OffloadDescriptor::wait_dependencies(
2533     const void **waits,
2534     int num_waits,
2535     _Offload_stream handle
2538     OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
2539     bool ret = true;
2540     OffloadDescriptor *task;
2541     if (num_waits == 0) {
2542         return true;
2545     // wait for streams
2546     if (num_waits == -1) {
2547         Stream * stream;
2548         // some specific stream of the device
2549         if (handle != 0) {
2550             stream = Stream::find_stream(handle, false);
2552             // the stream was not created or was destroyed
2553             if (!stream) {
2554                 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
2555                 LIBOFFLOAD_ABORT;
2557             task = stream->get_last_offload();
2559             // offload was completed by previous offload_wait pragma
2560             // or wait clause
2561             if (task == 0) {
2562                 return true;
2564             if (!task->offload_finish(0)) { //arg is 0 for is_traceback
2565                 ret = false;
// This descriptor is owned by the stream; finishing it transfers ownership
// here, so clean up, detach from the stream and delete it.
2567             task->cleanup();
2568             stream->set_last_offload(NULL);
2569             delete task;
2571         // all streams of the device or over all devices
2572         else {
2573             StreamMap stream_map = Stream::all_streams;
2574             for (StreamMap::iterator it = stream_map.begin();
2575                 it != stream_map.end(); it++) {
2576                 Stream * stream = it->second;
// Unless waiting across all devices, skip streams of other devices.
2578                 if (!m_wait_all_devices &&
2579                     stream->get_device() != m_device.get_logical_index()) {
2580                     continue;
2582                 // get associated async task
2583                 OffloadDescriptor *task = stream->get_last_offload();
2585                 // offload was completed by offload_wait pragma or wait clause
2586                 if (task == 0) {
2587                     continue;
2589                 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
2590                     ret = false;
2592                 task->cleanup();
2593                 stream->set_last_offload(NULL);
2594                 delete task;
2596             // no uncompleted streams
2597             return true;
2600     else {
2601         // if handle is equal to no_stream it's wait for signals
2602         for (int i = 0; i < num_waits; i++) {
2603             _Offload_stream stream_handle;
2604             Stream *stream;
2605             task = m_device.find_signal(waits[i], true);
2606             if (task == 0) {
2607                 LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
2608                                  waits[i]);
2609                 LIBOFFLOAD_ABORT;
2611             else if (task == SIGNAL_IS_REMOVED) {
2612                 continue;
2614             if (!task->offload_finish(0)) { //arg is 0 for is_traceback
2615                 ret = false;
2617             task->cleanup();
2618             // if the offload both has signal and is last offload of its
2619             // stream, we must wipe out the "last_offload" reference as
2620             // the offload already is finished.
2621             stream_handle = task->m_stream;
2622             if (stream_handle != -1) {
2623                 stream = Stream::find_stream(stream_handle, false);
2624                 if (stream && stream->get_last_offload() == task) {
2625                     stream->set_last_offload(NULL);
2628             delete task;
2631     return ret;
// offload_wrap: main driver for a single offload region. Traces/reports the
// request, waits for its dependencies, sets up variable descriptors and
// buffers, sends pointer data, builds the misc data, launches the compute
// run-function and initiates receives. For async offloads (signal/stream or
// OpenMP async) the descriptor is saved for a later offload_finish; for
// synchronous ones it finishes and cleans up here. Returns false on any
// failed stage (after cleanup()).
2634 bool OffloadDescriptor::offload_wrap(
2635     const char *name,
2636     bool is_empty,
2637     VarDesc *vars,
2638     VarDesc2 *vars2,
2639     int vars_total,
2640     const void **waits,
2641     int num_waits,
2642     const void **signal,
2643     int entry_id,
2644     const void *stack_addr,
2645     OffloadFlags offload_flags
2648     OffloadWaitKind wait_kind = c_offload_wait_signal;
2649     bool is_traceback = offload_flags.bits.fortran_traceback;
2651     // define kind of wait if any;
2652     // it can be one of the following kinds:
2653     // 1. c_offload_wait_signal for "offload_wait wait(signal)"
2654     // 2. c_offload_wait_stream for "offload_wait stream(stream)"
2655     // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
2656     if (num_waits == -1) {
2657         wait_kind = (m_stream == 0) ?
2658                     c_offload_wait_all_streams :
2659                     c_offload_wait_stream;
2661     char buf[35];
2662     const char *stream_str;
// Human-readable stream id for the trace output below.
2664     if (m_stream == no_stream || num_waits >= 0) {
2665         stream_str = "none";
2667     else if (m_stream == 0) {
2668         stream_str = "all";
2670     else {
2671         sprintf(buf, "%#llx", m_stream);
2672         stream_str = buf;
// Two near-identical trace branches: without (signal == 0) and with a signal.
2675     if (signal == 0) {
2676         OFFLOAD_DEBUG_TRACE_1(1,
2677                       GET_OFFLOAD_NUMBER(get_timer_data()),
2678                       c_offload_init_func,
2679                       "Offload function %s, is_empty=%d, #varDescs=%d, "
2680                       "signal=none, stream=%s, #waits=%d%c",
2681                       name, is_empty, vars_total, stream_str, num_waits,
2682                       num_waits == 0 ? '\n' : ' ');
2683         // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2684         // since the number of waits is not fixed.
2685         if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
2686             if (num_waits) {
2687                 printf("(");
2688                 if (m_stream == no_stream) {
2689                     printf("%p", waits[0]);
2690                     for (int i = 1; i < num_waits; i++) {
2691                         printf(", %p", waits[i]);
2694                 else if (m_stream != 0) {
2695                     printf("%#x", m_stream);
2697                 else {
2698                     printf(" all streams");
2700                 printf(")");
2702             printf("\n");
2703             fflush(NULL);
2705         // stream in wait is reported further in OFFLOAD_REPORT for waits
2706         if (m_stream != no_stream && num_waits == 0) {
2707             OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2708                       c_offload_stream,
2709                       "%d\n", m_stream);
2711         OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2712                       c_offload_signal,
2713                       "none %d\n", 0);
2715     else {
2716         OFFLOAD_DEBUG_TRACE_1(1,
2717                       GET_OFFLOAD_NUMBER(get_timer_data()),
2718                       c_offload_init_func,
2719                       "Offload function %s, is_empty=%d, #varDescs=%d, "
2720                       "signal=%p, stream=%s, #waits=%d%c",
2721                       name, is_empty, vars_total, *signal, stream_str, num_waits,
2722                       num_waits == 0 ? '\n' : ' ');
2723         // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2724         // since the number of waits is not fixed.
2725         if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
2726             if (num_waits) {
2727                 printf("(");
2728                 if (m_stream == no_stream) {
2729                     printf("%p", waits[0]);
2730                     for (int i = 1; i < num_waits; i++) {
2731                         printf(", %p", waits[i]);
2733                     printf(")");
2735                 else if (m_stream != 0) {
2736                     printf("%#x", m_stream);
2738                 else {
2739                     printf(" all streams");
2741                 printf(")");
2743             printf("\n");
2744             fflush(NULL);
2746         // stream in wait is reported further in OFFLOAD_REPORT for waits
2747         if (m_stream != no_stream && num_waits == 0) {
2748             OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2749                       c_offload_stream,
2750                       "%d\n", m_stream);
2752         OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2753                       c_offload_signal,
2754                       "%d\n", signal);
2756     if (console_enabled >= 1 && offload_flags.flags != 0) {
2757         trace_offload_flags(get_timer_data(), offload_flags);
2760     OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2761                   c_offload_wait, "%d\n",
2762                   wait_kind, num_waits,
2763                   (wait_kind == c_offload_wait_signal) ?
2764                   waits :
2765                   reinterpret_cast<const void **>(m_stream));
2767     if (m_status != 0) {
2768         m_status->result = OFFLOAD_SUCCESS;
2769         m_status->device_number = m_device.get_logical_index();
// An empty region with no non-pointer IN/OUT data lets later stages skip
// the run-function call entirely.
2772     m_initial_need_runfunction = m_need_runfunction = !is_empty;
2774     // wait for dependencies to finish
2775     if (!wait_dependencies(waits, num_waits, m_stream)) {
2776         cleanup();
2777         return false;
2780     // setup buffers
2781     if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
2782         cleanup();
2783         return false;
2786     if (offload_flags.bits.omp_async) {
2787         setup_omp_async_info();
2790     // initiate send for pointers. Want to do it as early as possible.
2791     if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
2792                            signal)) {
2793         cleanup();
2794         return false;
2797     // setup misc data for run function
2798     if (!setup_misc_data(name)) {
2799         cleanup();
2800         return false;
2803     // gather copyin data into buffer
2804     if (!gather_copyin_data()) {
2805         cleanup();
2806         return false;
2809     // Start the computation
2810     if (!compute(signal)) {
2811         cleanup();
2812         return false;
2815     // initiate receive for pointers
2816     if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
2817                               true, signal)) {
2818         cleanup();
2819         return false;
// OpenMP async offloads complete via the registered event callback; the
// descriptor must stay alive, so return without finishing here.
2821     if (offload_flags.bits.omp_async) {
2822         return true;
2824     // if there is a signal or stream save descriptor for the later use.
2825     // num_waits == -1 is for offload_wait and there is nothing to save
2826     if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
2827         if (signal != 0) {
2828             m_device.add_signal(*signal, this);
2831         if (m_stream != no_stream && m_stream != 0) {
2832             Stream* stream = Stream::find_stream(m_stream, false);
2833             if (stream) {
2834                 stream->set_last_offload(this);
2836             else {
2837                 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
2838                 LIBOFFLOAD_ABORT;
2841         // if there is a clause with alloc_if(1) and preallocated need to call
2842         // offload_finish after runfunction
2843         if (!m_preallocated_alloc) {
2844             return true;
2848     // wait for the offload to finish.
2849     if (!offload_finish(is_traceback)) {
2850         cleanup();
2851         return false;
2854     cleanup();
2855     return true;
2858 bool OffloadDescriptor::offload(
2859 const char *name,
2860 bool is_empty,
2861 VarDesc *vars,
2862 VarDesc2 *vars2,
2863 int vars_total,
2864 const void **waits,
2865 int num_waits,
2866 const void **signal,
2867 int entry_id,
2868 const void *stack_addr,
2869 OffloadFlags offload_flags
2872 bool res;
2873 res = offload_wrap(name, is_empty, vars, vars2, vars_total,
2874 waits, num_waits, signal, entry_id,
2875 stack_addr, offload_flags);
2876 if (res == false && !m_traceback_called) {
2877 if (offload_flags.bits.fortran_traceback) {
2878 OFFLOAD_TRACE(3,
2879 "Calling Fortran library to continue traceback from MIC\n");
2880 FORTRAN_TRACE_BACK(m_status->result);
2881 m_traceback_called = true;
2884 return res;
// offload_finish: complete an offload — wait for compute (in) events,
// scatter copyout data, receive preallocated-pointer data, wait for read
// (out) events, and destroy the buffers queued in m_destroy_buffers.
// is_traceback requests a Fortran traceback on failure. Returns false on
// any COI failure when a status object exists; otherwise aborts.
2887 bool OffloadDescriptor::offload_finish(
2888     bool is_traceback
2891     COIRESULT res;
2893     // wait for compute dependencies to become signaled
2894     if (m_in_deps_total > 0) {
2895         OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
// Active wait polls with a zero timeout in a loop; passive wait blocks with
// an infinite (-1) timeout.
2897         if (__offload_active_wait) {
2898             // keep CPU busy
2899             do {
2900                 res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
2902             while (res == COI_TIME_OUT_REACHED);
2904         else {
2905             res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
2908         if (res != COI_SUCCESS) {
2909             if (m_status != 0 && !m_traceback_called) {
2910                 m_status->result = translate_coi_error(res);
2911                 if (is_traceback) {
2912                     OFFLOAD_TRACE(3,
2913                         "Calling Fortran library to continue traceback from MIC\n");
2914                     FORTRAN_TRACE_BACK(m_status->result);
2915                     m_traceback_called = true;
2917                 return false;
2920             if (is_traceback && !m_traceback_called) {
2921                 OFFLOAD_TRACE(3,
2922                     "Calling Fortran library to continue traceback from MIC\n");
2923                 FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
2924                 m_traceback_called = true;
2927             report_coi_error(c_event_wait, res);
2931     // scatter copyout data received from target
2932     if (!scatter_copyout_data()) {
2933         return false;
// Pointers the sink allocated during the run can only be received after the
// compute step has finished.
2936     if (m_out_with_preallocated &&
2937         !receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
2938         cleanup();
2939         return false;
2942     // wait for receive dependencies to become signaled
2943     if (m_out_deps_total > 0) {
2944         OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
2946         if (__offload_active_wait) {
2947             // keep CPU busy
2948             do {
2949                 res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
2951             while (res == COI_TIME_OUT_REACHED);
2953         else {
2954             res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
2957         if (res != COI_SUCCESS) {
2958             if (m_status != 0) {
2959                 m_status->result = translate_coi_error(res);
2960                 return false;
2962             report_coi_error(c_event_wait, res);
2966     // destroy buffers
2968         OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
2970         for (BufferList::const_iterator it = m_destroy_buffers.begin();
2971              it != m_destroy_buffers.end(); it++) {
2972             res = COI::BufferDestroy(*it);
2973             if (res != COI_SUCCESS) {
2974                 if (m_status != 0) {
2975                     m_status->result = translate_coi_error(res);
2976                     return false;
2978                 report_coi_error(c_buf_destroy, res);
2983     return true;
2986 void OffloadDescriptor::cleanup()
2988 // release device in orsl
2989 ORSL::release(m_device.get_logical_index());
2991 OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
2993 // report stuff
2994 Offload_Report_Epilog(get_timer_data());
2997 bool OffloadDescriptor::is_signaled()
2999 bool signaled = true;
3000 COIRESULT res;
3002 // check compute and receive dependencies
3003 if (m_in_deps_total > 0) {
3004 res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
3005 signaled = signaled && (res == COI_SUCCESS);
3007 if (m_out_deps_total > 0) {
3008 res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
3009 signaled = signaled && (res == COI_SUCCESS);
3012 return signaled;
3015 static Arr_Desc * make_arr_desc(
3016 void* ptr_val,
3017 int64_t extent_start_val,
3018 int64_t extent_elements_val,
3019 int64_t size
3022 Arr_Desc *res;
3023 res = (Arr_Desc *)malloc(sizeof(Arr_Desc));
3024 if (res == NULL)
3025 LIBOFFLOAD_ERROR(c_malloc);
3026 res->base = reinterpret_cast<int64_t>(ptr_val);
3027 res->rank = 1;
3028 res->dim[0].size = size;
3029 res->dim[0].lindex = 0;
3030 res->dim[0].lower = extent_start_val;
3031 res->dim[0].upper = extent_elements_val + extent_start_val - 1;
3032 res->dim[0].stride = 1;
3033 return res;
3036 // Send pointer data if source or destination or both of them are
3037 // noncontiguous. There is a guarantee that the destination length is
3038 // enough for the transferred data.
// Transfers variable i from host to sink when either side is described by
// noncontiguous ranges. Uses COI::BufferWriteMultiD in one call when
// available; otherwise walks matching contiguous sub-ranges of source and
// destination, issuing one BufferCopy/BufferWrite per chunk. data_sent
// returns the number of bytes actually queued; when `event` is non-NULL one
// completion event per chunk is appended to m_in_deps.
3039 bool OffloadDescriptor::send_noncontiguous_pointer_data(
3040     int i,
3041     PtrData* src_data,
3042     PtrData* dst_data,
3043     COIEVENT *event,
3044     uint64_t &data_sent,
3045     uint32_t in_deps_amount,
3046     COIEVENT *in_deps
3049     int64_t offset_src, offset_dst;
3050     int64_t length_src, length_dst;
3051     int64_t length_src_cur, length_dst_cur;
3052     int64_t send_size;
3053     COIRESULT res;
3054     bool dst_is_empty = true;
3055     bool src_is_empty = true;
3057     data_sent = 0;
3059     // Set length_src and length_dst
3060     length_src = (m_vars_extra[i].read_rng_src) ?
3061         m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
3062     length_dst = !m_vars[i].into ? length_src :
3063                      (m_vars_extra[i].read_rng_dst) ?
3064                      m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
// Each chunk is the smaller of the two contiguous extents.
3065     send_size = (length_src < length_dst) ? length_src : length_dst;
3067     // If BufferWriteMultiD is defined we can set values of required arguments
3068     // and transfer noncontiguous data via call to the COI routine.
3069     if (__offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
3070         struct Arr_Desc* arr_desc_dst;
3071         struct Arr_Desc* arr_desc_src;
3072         int64_t size_src, size_dst;
3073         char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
3074                                           m_vars[i].type.src);
3075         COIBUFFER dst_buf = m_vars[i].into ?
3076             m_vars_extra[i].dst_data->mic_buf :
3077             m_vars_extra[i].src_data->mic_buf;
3079         offset_src = (m_vars_extra[i].read_rng_src)?
3080             m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp;
3081         size_src = m_vars_extra[i].read_rng_src ?
3082             cean_get_transf_size(m_vars_extra[i].read_rng_src) :
3083             m_vars[i].size;
3085         offset_dst = (m_vars_extra[i].read_rng_dst)?
3086             m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp;
3087         size_dst = m_vars_extra[i].read_rng_dst ?
3088             cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
3090         int64_t el_size = (!m_vars[i].into ||
3091                            (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ?
3093                           m_vars_extra[i].read_rng_src ?
3094                           m_vars_extra[i].read_rng_src->arr_desc->dim[
3095                               m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
3096                           m_vars_extra[i].read_rng_dst->arr_desc->dim[
3097                               m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
// Synthesize a rank-1 descriptor for whichever side is contiguous.
3099         arr_desc_src = (m_vars_extra[i].read_rng_src) ?
3100             m_vars_extra[i].read_rng_src->arr_desc :
3101             make_arr_desc(NULL, // not required for source
3102                           offset_src/el_size, size_src/el_size, el_size);
3104         arr_desc_dst = !m_vars[i].into ?
3105             arr_desc_src :
3106             (m_vars_extra[i].read_rng_dst) ?
3107                 m_vars_extra[i].read_rng_dst->arr_desc :
3108                 make_arr_desc(NULL,
3109                               offset_dst/el_size, size_src/el_size, el_size);
3111         int64_t alloc_disp = m_vars[i].into ?
3112             m_vars_extra[i].dst_data->alloc_disp :
3113             m_vars_extra[i].src_data->alloc_disp;
3115         arr_desc_src->base = reinterpret_cast<int64_t>(base);
3116         arr_desc_dst->base = 0;
3118         res = COI::BufferWriteMultiD(
3119             dst_buf, // in_DestBuffer,
3120             m_device.get_process(), // DestProcess,
3121             m_vars[i].offset + m_vars[i].mic_offset -
3122             alloc_disp, // Offset
3123             (void*)arr_desc_dst, // descriptor of DestArray
3124             (void*)arr_desc_src, // descriptor of SrcArray
3125             COI_COPY_UNSPECIFIED, // Type
3126             in_deps_amount, // Number of in Dependencies
3127             in_deps, // array of in Dependencies
3128             event); // out Dependency
3129         if (res != COI_SUCCESS) {
3130             if (m_status != 0) {
3131                 m_status->result = translate_coi_error(res);
3132                 return false;
3134             report_coi_error(c_buf_copy, res);
3136         return(true);
3139     // if event is defined we must multiply it for all contiguous intervals
3140     // that will be copied/written.
3141     // Take into account that we already have 1 event.
3142     if (event) {
3143         m_in_deps_allocated += (length_src / send_size) *
3144                                ((m_vars_extra[i].read_rng_src) ?
3145                                 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
3146         m_in_deps =
3147             (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
3148         m_in_deps_total--;
3151     // consequently get contiguous ranges,
3152     // define corresponded destination offset and send data
3153     do {
// Advance the source cursor: fetch the next source range when the current
// one is exhausted, otherwise step within it by send_size.
3154         if (src_is_empty) {
3155             if (m_vars_extra[i].read_rng_src) {
3156                 if (!get_next_range(m_vars_extra[i].read_rng_src,
3157                                     &offset_src)) {
3158                     // source ranges are over - nothing to send
3159                     break;
3162             else if (data_sent == 0) {
3163                 offset_src = m_vars_extra[i].cpu_disp;
3165             else {
3166                 break;
3168             length_src_cur = length_src;
3170         else {
3171             // if source is contiguous or its contiguous range is greater
3172             // than destination one
3173             offset_src += send_size;
3175         length_src_cur -= send_size;
3176         src_is_empty = length_src_cur == 0;
// Advance the destination cursor symmetrically.
3178         if (dst_is_empty) {
3179             if (m_vars[i].into) {
3180                 if (m_vars_extra[i].read_rng_dst) {
3181                     if (!get_next_range(m_vars_extra[i].read_rng_dst,
3182                                         &offset_dst)) {
3183                         // destination ranges are over
3184                         LIBOFFLOAD_ERROR(c_destination_is_over);
3185                         return false;
3188                 // into is contiguous.
3189                 else {
3190                     offset_dst = m_vars[i].disp;
3192                 length_dst_cur = length_dst;
3194             // same as source
3195             else {
3196                 offset_dst = offset_src;
3197                 length_dst_cur = length_src;
3200         else {
3201             // if destination is contiguous or its contiguous range is greater
3202             // than source one
3203             offset_dst += send_size;
3205         length_dst_cur -= send_size;
3206         dst_is_empty = length_dst_cur == 0;
// Each chunk gets its own completion event slot (reserved above).
3208         if (event) {
3209             event = &m_in_deps[m_in_deps_total++];
// Buffer-to-buffer copy when the host side is backed by a COI buffer,
// otherwise a direct write from host memory.
3211         if (src_data != 0 && src_data->cpu_buf != 0) {
3212             res = COI::BufferCopy(
3213                 dst_data->mic_buf,
3214                 src_data->cpu_buf,
3215                 m_vars[i].mic_offset +
3216                 m_vars[i].offset + offset_dst,
3217                 m_vars_extra[i].cpu_offset + offset_src,
3218                 send_size,
3219                 COI_COPY_UNSPECIFIED,
3220                 in_deps_amount, in_deps,
3221                 event);
3222             if (res != COI_SUCCESS) {
3223                 if (m_status != 0) {
3224                     m_status->result = translate_coi_error(res);
3225                     return false;
3227                 report_coi_error(c_buf_copy, res);
3230         else {
3231             char *base = offload_get_src_base(m_vars[i].ptr,
3232                                               m_vars[i].type.src);
3234             res = COI::BufferWrite(
3235                 dst_data->mic_buf,
3236                 m_vars[i].mic_offset +
3237                 m_vars[i].offset + offset_dst,
3238                 base + offset_src,
3239                 send_size,
3240                 COI_COPY_UNSPECIFIED,
3241                 in_deps_amount, in_deps,
3242                 event);
3243             if (res != COI_SUCCESS) {
3244                 if (m_status != 0) {
3245                     m_status->result = translate_coi_error(res);
3246                     return false;
3248                 report_coi_error(c_buf_write, res);
3251         data_sent += send_size;
3253     while (true);
3254     return true;
3257 bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
3259 OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
3261 bool should_use_async_buffer_write = m_initial_need_runfunction;
3262 uint64_t ptr_sent = 0;
3263 COIRESULT res;
3264 uint32_t in_deps_amount = 0;
3265 COIEVENT *in_deps = NULL;
3267 // For offload_transfer and offload with empty body without signal:
3268 // - if there is only one buffer copy - send data synchronously
3269 // - if there are multiple buffer copy and
3270 // __offload_parallel_copy is false - send data synchronously
3271 // - if there are multiple buffer copy and
3272 // __offload_parallel_copy is true - send data asynchronously
3273 // It concerns only big size data - greater than __offload_use_async_buffer_write.
3274 // Data of size less than __offload_use_async_buffer_write are sent synchronously.
3275 // Synchronous transfer results in better performance in COI.
3276 // __offload_parallel_copy is false by default but can be changed
3277 // via environment variable OFFLOAD_PARALLEL_COPY
3278 if (!m_initial_need_runfunction && __offload_parallel_copy) {
3279 int big_size_count = 0;
3280 for (int i = 0; i < m_vars_total; i++) {
3281 if (m_vars[i].direction.in &&
3282 m_vars[i].size >= __offload_use_async_buffer_write) {
3283 switch (m_vars[i].type.dst) {
3284 case c_data:
3285 case c_void_ptr:
3286 case c_cean_var:
3287 if (m_vars[i].flags.is_static_dstn) {
3288 big_size_count++;
3290 break;
3291 case c_string_ptr:
3292 case c_data_ptr:
3293 case c_cean_var_ptr:
3294 case c_dv_ptr:
3295 case c_dv_data:
3296 case c_dv_ptr_data:
3297 case c_dv_data_slice:
3298 case c_dv_ptr_data_slice:
3299 big_size_count++;
3300 break;
3301 default:
3302 break;
3306 if (big_size_count > 1) {
3307 should_use_async_buffer_write = true;
3311 if (m_stream != no_stream && m_vars_total != 0) {
3312 get_stream_in_dependencies(in_deps_amount, in_deps);
3315 // Initiate send for pointer data
3316 for (int i = 0; i < m_vars_total; i++) {
3317 uint64_t sent_data = m_vars[i].size;
3318 uint32_t in_deps_amount_save;
3319 COIEVENT *in_deps_save;
3321 if (m_vars_extra[i].omp_last_event_type == c_last_write) {
3322 in_deps_amount_save = in_deps_amount;
3323 in_deps_save = in_deps;
3324 in_deps_amount = m_in_deps_total;
3325 if (in_deps_amount > 0) {
3326 in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount);
3327 if (in_deps == NULL)
3328 LIBOFFLOAD_ERROR(c_malloc);
3329 memcpy(in_deps, m_in_deps,in_deps_amount * sizeof(COIEVENT));
3332 switch (m_vars[i].type.dst) {
3333 case c_data_ptr_array:
3334 break;
3335 case c_data:
3336 case c_void_ptr:
3337 case c_cean_var:
3338 if (m_vars[i].direction.in &&
3339 m_vars[i].flags.is_static_dstn) {
3340 COIEVENT *event =
3341 (is_async ||
3342 (should_use_async_buffer_write &&
3343 m_vars[i].size >= __offload_use_async_buffer_write)) ?
3344 &m_in_deps[m_in_deps_total++] : 0;
3345 PtrData* dst_data = m_vars[i].into ?
3346 m_vars_extra[i].dst_data :
3347 m_vars_extra[i].src_data;
3348 PtrData* src_data =
3349 VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
3350 VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
3351 m_vars[i].flags.is_static ?
3352 m_vars_extra[i].src_data : 0;
3354 if (m_vars[i].flags.is_noncont_src ||
3355 m_vars[i].flags.is_noncont_dst) {
3356 if (!send_noncontiguous_pointer_data(
3357 i, src_data, dst_data, event, sent_data,
3358 in_deps_amount, in_deps)) {
3359 return false;
3362 else if (src_data != 0 && src_data->cpu_buf != 0) {
3363 res = COI::BufferCopy(
3364 dst_data->mic_buf,
3365 src_data->cpu_buf,
3366 m_vars[i].mic_offset +
3367 m_vars[i].offset + m_vars[i].disp,
3368 m_vars_extra[i].cpu_offset +
3369 m_vars_extra[i].cpu_disp,
3370 m_vars[i].size,
3371 COI_COPY_UNSPECIFIED,
3372 in_deps_amount, in_deps,
3373 event);
3374 if (res != COI_SUCCESS) {
3375 if (m_status != 0) {
3376 m_status->result = translate_coi_error(res);
3377 return false;
3379 report_coi_error(c_buf_copy, res);
3382 else {
3383 char *base = offload_get_src_base(m_vars[i].ptr,
3384 m_vars[i].type.src);
3385 res = COI::BufferWrite(
3386 dst_data->mic_buf,
3387 m_vars[i].mic_offset +
3388 m_vars[i].offset + m_vars[i].disp,
3389 base + m_vars_extra[i].cpu_disp,
3390 m_vars[i].size,
3391 COI_COPY_UNSPECIFIED,
3392 in_deps_amount, in_deps,
3393 event);
3394 if (res != COI_SUCCESS) {
3395 if (m_status != 0) {
3396 m_status->result = translate_coi_error(res);
3397 return false;
3399 report_coi_error(c_buf_write, res);
3402 ptr_sent += sent_data;
3404 break;
3406 case c_string_ptr:
3407 case c_data_ptr:
3408 case c_cean_var_ptr:
3409 case c_dv_ptr:
3410 if (m_vars[i].direction.in && m_vars[i].size > 0) {
3411 COIEVENT *event =
3412 (is_async ||
3413 (should_use_async_buffer_write &&
3414 m_vars[i].size >= __offload_use_async_buffer_write)) ?
3415 &m_in_deps[m_in_deps_total++] : 0;
3416 PtrData* dst_data = m_vars[i].into ?
3417 m_vars_extra[i].dst_data :
3418 m_vars_extra[i].src_data;
3419 PtrData* src_data =
3420 VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
3421 VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
3422 m_vars[i].flags.is_static ?
3423 m_vars_extra[i].src_data : 0;
3425 if (m_vars[i].flags.is_noncont_src ||
3426 m_vars[i].flags.is_noncont_dst) {
3427 send_noncontiguous_pointer_data(
3428 i, src_data, dst_data, event, sent_data,
3429 in_deps_amount, in_deps);
3431 else if (src_data != 0 && src_data->cpu_buf != 0) {
3432 res = COI::BufferCopy(
3433 dst_data->mic_buf,
3434 src_data->cpu_buf,
3435 m_vars[i].mic_offset +
3436 m_vars[i].offset + m_vars[i].disp,
3437 m_vars_extra[i].cpu_offset +
3438 m_vars_extra[i].cpu_disp,
3439 m_vars[i].size,
3440 COI_COPY_UNSPECIFIED,
3441 in_deps_amount, in_deps,
3442 event);
3443 if (res != COI_SUCCESS) {
3444 if (m_status != 0) {
3445 m_status->result = translate_coi_error(res);
3446 return false;
3448 report_coi_error(c_buf_copy, res);
3451 else {
3452 char *base = offload_get_src_base(m_vars[i].ptr,
3453 m_vars[i].type.src);
3454 res = COI::BufferWrite(
3455 dst_data->mic_buf,
3456 m_vars[i].mic_offset +
3457 m_vars[i].offset + m_vars[i].disp,
3458 base + m_vars_extra[i].cpu_disp,
3459 m_vars[i].size,
3460 COI_COPY_UNSPECIFIED,
3461 in_deps_amount, in_deps,
3462 event);
3463 if (res != COI_SUCCESS) {
3464 if (m_status != 0) {
3465 m_status->result = translate_coi_error(res);
3466 return false;
3468 report_coi_error(c_buf_write, res);
3472 ptr_sent += sent_data;
3474 break;
3476 case c_dv_data:
3477 case c_dv_ptr_data:
3478 if (m_vars[i].direction.in &&
3479 m_vars[i].size > 0) {
3480 PtrData *ptr_data = m_vars[i].into ?
3481 m_vars_extra[i].dst_data :
3482 m_vars_extra[i].src_data;
3483 PtrData* src_data = m_vars_extra[i].src_data;
3485 COIEVENT *event =
3486 (is_async ||
3487 (should_use_async_buffer_write &&
3488 m_vars[i].size >= __offload_use_async_buffer_write)) ?
3489 &m_in_deps[m_in_deps_total++] : 0;
3491 if (m_vars[i].flags.is_noncont_src ||
3492 m_vars[i].flags.is_noncont_dst) {
3493 send_noncontiguous_pointer_data(
3494 i, src_data, ptr_data, event, sent_data,
3495 in_deps_amount, in_deps);
3497 else if (src_data && src_data->cpu_buf != 0) {
3498 res = COI::BufferCopy(
3499 ptr_data->mic_buf,
3500 src_data->cpu_buf,
3501 m_vars[i].offset + ptr_data->mic_offset +
3502 m_vars[i].disp,
3503 m_vars_extra[i].cpu_offset +
3504 m_vars_extra[i].cpu_disp,
3505 m_vars[i].size,
3506 COI_COPY_UNSPECIFIED,
3507 in_deps_amount, in_deps,
3508 event);
3509 if (res != COI_SUCCESS) {
3510 if (m_status != 0) {
3511 m_status->result = translate_coi_error(res);
3512 return false;
3514 report_coi_error(c_buf_copy, res);
3517 else {
3518 char *base = offload_get_src_base(m_vars[i].ptr,
3519 m_vars[i].type.src);
3520 res = COI::BufferWrite(
3521 ptr_data->mic_buf,
3522 ptr_data->mic_offset +
3523 m_vars[i].offset + m_vars[i].disp,
3524 base + m_vars_extra[i].cpu_disp,
3525 m_vars[i].size,
3526 COI_COPY_UNSPECIFIED,
3527 in_deps_amount, in_deps,
3528 event);
3529 if (res != COI_SUCCESS) {
3530 if (m_status != 0) {
3531 m_status->result = translate_coi_error(res);
3532 return false;
3534 report_coi_error(c_buf_write, res);
3537 ptr_sent += sent_data;
3539 break;
3541 case c_dv_data_slice:
3542 case c_dv_ptr_data_slice:
3543 if (m_vars[i].direction.in &&
3544 m_vars[i].size > 0) {
3545 PtrData *dst_data = m_vars[i].into ?
3546 m_vars_extra[i].dst_data :
3547 m_vars_extra[i].src_data;
3548 PtrData* src_data =
3549 (VAR_TYPE_IS_PTR(m_vars[i].type.src) ||
3550 VAR_TYPE_IS_DV_DATA(m_vars[i].type.src) ||
3551 VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) ||
3552 VAR_TYPE_IS_SCALAR(m_vars[i].type.src) &&
3553 m_vars[i].flags.is_static) ?
3554 m_vars_extra[i].src_data : 0;
3555 COIEVENT *event =
3556 (is_async ||
3557 (should_use_async_buffer_write &&
3558 m_vars[i].size >= __offload_use_async_buffer_write)) ?
3559 &m_in_deps[m_in_deps_total++] : 0;
3560 if (m_vars[i].flags.is_noncont_src ||
3561 m_vars[i].flags.is_noncont_dst) {
3562 send_noncontiguous_pointer_data(
3563 i, src_data, dst_data, event, sent_data,
3564 in_deps_amount, in_deps);
3566 else if (src_data && src_data->cpu_buf != 0) {
3567 res = COI::BufferCopy(
3568 dst_data->mic_buf,
3569 src_data->cpu_buf,
3570 m_vars[i].offset +
3571 dst_data->mic_offset +
3572 m_vars[i].disp,
3573 m_vars_extra[i].cpu_offset +
3574 m_vars_extra[i].cpu_disp,
3575 m_vars[i].size,
3576 COI_COPY_UNSPECIFIED,
3577 in_deps_amount, in_deps,
3578 event);
3579 if (res != COI_SUCCESS) {
3580 if (m_status != 0) {
3581 m_status->result = translate_coi_error(res);
3582 return false;
3584 report_coi_error(c_buf_copy, res);
3587 else {
3588 char *base = offload_get_src_base(m_vars[i].ptr,
3589 m_vars[i].type.src);
3590 res = COI::BufferWrite(
3591 dst_data->mic_buf,
3592 dst_data->mic_offset +
3593 m_vars[i].offset + m_vars[i].disp,
3594 base + m_vars_extra[i].cpu_disp,
3595 m_vars[i].size,
3596 COI_COPY_UNSPECIFIED,
3597 in_deps_amount, in_deps,
3598 event);
3599 if (res != COI_SUCCESS) {
3600 if (m_status != 0) {
3601 m_status->result = translate_coi_error(res);
3602 return false;
3604 report_coi_error(c_buf_write, res);
3608 ptr_sent += sent_data;
3610 break;
3612 default:
3613 break;
3615 if (m_vars_extra[i].omp_last_event_type == c_last_write) {
3616 in_deps_amount = in_deps_amount_save;
3617 in_deps = in_deps_save;
3618 register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
3620 // alloc field isn't used at target.
3621 // We can reuse it for offset of array pointers.
3622 if (m_vars_extra[i].is_arr_ptr_el) {
3623 m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
3627 if (m_status) {
3628 m_status->data_sent += ptr_sent;
3631 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
3632 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
3633 c_offload_sent_pointer_data,
3634 "Total pointer data sent to target: [%lld] bytes\n",
3635 ptr_sent);
3637 return true;
// Marshals (gathers) all copy-in data for the pending offload into the
// in/out data buffer that precedes the run-function dispatch: variable
// descriptors first, then per-variable payloads via the m_in marshaller.
// Returns false (recording a translated COI error in m_status when it is
// non-NULL) on BufferMap/BufferUnmap failure; true otherwise.
// NOTE(review): this chunk is a blob-view extraction with original line
// numbers embedded and closing-brace lines dropped; code is kept
// byte-identical here, only comments are added.
3640 bool OffloadDescriptor::gather_copyin_data()
3642 OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
// Only needed when a run-function will actually execute and there is
// copy-in payload to marshal.
3644 if (m_need_runfunction && m_in_datalen > 0) {
3645 COIMAPINSTANCE map_inst;
3646 char *data;
3648 // init marshaller
// Map the COI in/out buffer for writing; otherwise the data area lives
// inline after the function descriptor (data_offset path).
3649 if (m_inout_buf != 0) {
3650 OffloadTimer timer_map(get_timer_data(),
3651 c_offload_host_map_in_data_buffer);
3653 COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
3654 COI_MAP_WRITE_ENTIRE_BUFFER,
3655 0, 0, 0, &map_inst,
3656 reinterpret_cast<void**>(&data));
3657 if (res != COI_SUCCESS) {
3658 if (m_status != 0) {
3659 m_status->result = translate_coi_error(res);
3660 return false;
3662 report_coi_error(c_buf_map, res);
3665 else {
3666 data = (char*) m_func_desc + m_func_desc->data_offset;
3669 // send variable descriptors
3670 memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
3671 data += m_vars_total * sizeof(VarDesc);
3673 // init marshaller
3674 m_in.init_buffer(data, m_in_datalen);
3676 // Gather copy data into buffer
3677 for (int i = 0; i < m_vars_total; i++) {
3678 bool src_is_for_mic = (m_vars[i].direction.out ||
3679 m_vars[i].into == NULL);
3680 PtrData* ptr_data = src_is_for_mic ?
3681 m_vars_extra[i].src_data :
3682 m_vars_extra[i].dst_data;
// Optional per-variable prefixes consumed by the target side:
// allocation displacement and/or the sink (target) address.
3683 if (m_vars[i].flags.alloc_disp) {
3684 m_in.send_data(&ptr_data->alloc_disp,
3685 sizeof(ptr_data->alloc_disp));
3688 // send sink address to the target
3689 if (m_vars[i].flags.sink_addr) {
3690 m_in.send_data(&ptr_data->mic_addr,
3691 sizeof(ptr_data->mic_addr));
// Marshal the payload according to the destination variable kind.
3694 switch (m_vars[i].type.dst) {
3695 case c_data_ptr_array:
3696 break;
3697 case c_data:
3698 case c_void_ptr:
3699 case c_cean_var:
3700 if (m_vars[i].direction.in &&
3701 !m_vars[i].flags.is_static_dstn) {
3703 char *ptr = offload_get_src_base(m_vars[i].ptr,
3704 m_vars[i].type.src);
3705 if (m_vars[i].type.dst == c_cean_var) {
3706 // offset and length are derived from the array
3707 // descriptor
3708 int64_t size = m_vars[i].size;
3709 int64_t disp = m_vars[i].disp;
3710 m_in.send_data(reinterpret_cast<char*>(&size),
3711 sizeof(int64_t));
3712 m_in.send_data(reinterpret_cast<char*>(&disp),
3713 sizeof(int64_t));
3716 m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
3717 m_vars[i].size);
3719 break;
3721 case c_dv:
3722 if (m_vars[i].direction.bits ||
3723 m_vars[i].alloc_if ||
3724 m_vars[i].free_if) {
3725 // send dope vector excluding base
3726 char *ptr = static_cast<char*>(m_vars[i].ptr);
3727 m_in.send_data(ptr + sizeof(uint64_t),
3728 m_vars[i].size - sizeof(uint64_t));
3730 break;
3732 case c_data_ptr:
3733 // send to target addresses of obsolete
3734 // stacks to be released
3735 if (m_vars[i].flags.is_stack_buf &&
3736 !m_vars[i].direction.bits &&
3737 m_vars[i].alloc_if &&
3738 m_vars[i].size != 0) {
3739 for (PtrDataList::iterator it =
3740 m_destroy_stack.begin();
3741 it != m_destroy_stack.end(); it++) {
3742 PtrData * ptr_data = *it;
3743 m_in.send_data(&(ptr_data->mic_addr),
3744 sizeof(ptr_data->mic_addr));
3747 break;
3748 case c_func_ptr:
// Function pointers are translated to target addresses by the
// marshaller rather than copied verbatim.
3749 if (m_vars[i].direction.in) {
3750 m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
3752 break;
3754 default:
3755 break;
3759 if (m_status) {
3760 m_status->data_sent += m_in.get_tfr_size();
// data_offset == 0 means the mapped-buffer path was taken above;
// unmap it now that marshalling is finished.
3763 if (m_func_desc->data_offset == 0) {
3764 OffloadTimer timer_unmap(get_timer_data(),
3765 c_offload_host_unmap_in_data_buffer);
3766 COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
3767 if (res != COI_SUCCESS) {
3768 if (m_status != 0) {
3769 m_status->result = translate_coi_error(res);
3770 return false;
3772 report_coi_error(c_buf_unmap, res);
3777 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
3778 OFFLOAD_DEBUG_TRACE_1(1,
3779 GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
3780 "Total copyin data sent to target: [%lld] bytes\n",
3781 m_in.get_tfr_size());
3783 return true;
// Dispatches the offload run-function on the target device through
// m_device.compute(), wiring the accumulated m_in_deps events as input
// dependencies and publishing the completion event as the single new
// input dependency for subsequent transfers.
// info: opaque payload forwarded to the OpenMP async-event callback.
// Returns false (with translated COI error in m_status when set) on
// dispatch failure; true otherwise.
// NOTE(review): extraction artifact — embedded line numbers, dropped
// closing braces; code kept byte-identical, comments only.
3786 bool OffloadDescriptor::compute(void *info)
3788 OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
3790 if (m_need_runfunction) {
3791 OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
3792 c_offload_compute, "Compute task on MIC\n");
3794 void* misc = m_func_desc;
3795 int misc_len = m_func_desc_size;
3796 void* ret = 0;
3797 int ret_len = 0;
// When copy-in data is appended after the descriptor (data_offset != 0)
// it rides along in the misc payload; the same area doubles as the
// return (copy-out) buffer when out-data is expected.
3799 if (m_func_desc->data_offset != 0) {
3800 misc_len += m_in_datalen;
3802 if (m_out_datalen > 0) {
3803 ret = (char*) m_func_desc + m_func_desc->data_offset;
3804 ret_len = m_out_datalen;
3808 // dispatch task
3809 COIRESULT res;
3810 COIEVENT event;
3811 uint32_t in_deps_amount = m_in_deps_total;
3812 COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
// With no explicit input dependencies, a stream offload still has to
// order itself behind earlier work queued on the same stream.
3814 if (0 == m_in_deps_total && m_stream != no_stream) {
3815 get_stream_in_dependencies(in_deps_amount, in_deps);
3818 res = m_device.compute(m_stream,
3819 m_compute_buffers,
3820 misc, misc_len,
3821 ret, ret_len,
3822 in_deps_amount,
3823 in_deps,
3824 &event);
3826 if (res != COI_SUCCESS) {
3827 if (m_status != 0) {
3828 m_status->result = translate_coi_error(res);
3829 return false;
3831 report_coi_error(c_pipeline_run_func, res);
3834 if (m_omp_async_last_event_type == c_last_runfunc) {
3835 register_omp_event_call_back(&event, info);
// The run-function completion event replaces all prior input deps:
// later copy-out operations need only wait for the compute to finish.
3838 m_in_deps_total = 1;
3839 m_in_deps[0] = event;
3842 return true;
// NOTE(review): extraction artifact — original line numbers are embedded in
// each line and closing-brace lines were dropped by the blob export; the
// code below is kept byte-identical, only comments are added.
//
// Parameters:
//   i             - index into m_vars / m_vars_extra for the variable.
//   dst_buf       - host-side COI buffer, or 0 to read into raw host memory.
//   event         - out-event slot; when non-NULL the transfers are async
//                   and additional events are appended to m_out_deps.
//   received_data - out: total number of bytes transferred.
//   in_deps_amount / in_deps - input dependencies for each COI operation.
// Returns false on COI failure or when destination ranges are exhausted
// before the source; true otherwise.
3845 // receive pointer data if source or destination or both of them are
3846 // noncontiguous. There is guarantee that length of destination enough for
3847 // transferred data.
3848 bool OffloadDescriptor::receive_noncontiguous_pointer_data(
3849 int i,
3850 COIBUFFER dst_buf,
3851 COIEVENT *event,
3852 uint64_t &received_data,
3853 uint32_t in_deps_amount,
3854 COIEVENT *in_deps
3857 int64_t offset_src, offset_dst;
3858 int64_t length_src, length_dst;
3859 int64_t length_src_cur, length_dst_cur;
3860 int64_t receive_size;
3861 COIRESULT res;
3862 bool dst_is_empty = true;
3863 bool src_is_empty = true;
3865 char *base = offload_get_src_base(
3866 m_vars[i].into ?
3867 static_cast<char*>(m_vars[i].into) :
3868 static_cast<char*>(m_vars[i].ptr),
3869 m_vars[i].type.dst);
3870 received_data = 0;
3872 // Set length_src and length_dst
3873 length_src = (m_vars_extra[i].read_rng_src) ?
3874 m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
3875 length_dst = !m_vars[i].into ? length_src :
3876 (m_vars_extra[i].read_rng_dst) ?
3877 m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
// Each chunk moved per iteration is the smaller of the two range sizes.
3878 receive_size = (length_src < length_dst) ? length_src : length_dst;
3880 // If BufferReadMultiD is defined we can set values of required arguments
3881 // and transfer noncontiguous data via call to the COI routine.
3882 if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
3883 struct Arr_Desc* arr_desc_dst;
3884 struct Arr_Desc* arr_desc_src;
3885 int64_t size_src, size_dst;
3887 offset_src = (m_vars_extra[i].read_rng_src)?
3888 m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
3889 size_src = m_vars_extra[i].read_rng_src ?
3890 cean_get_transf_size(m_vars_extra[i].read_rng_src) :
3891 m_vars[i].size;
3893 offset_dst = (m_vars_extra[i].read_rng_dst)?
3894 m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp;
3895 size_dst = m_vars_extra[i].read_rng_dst ?
3896 cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
// Element size is taken from the innermost (fastest-varying) dimension
// of whichever side has an array descriptor.
3898 int64_t el_size = (!m_vars[i].into ||
3899 (m_vars_extra[i].read_rng_src &&
3900 m_vars_extra[i].read_rng_dst)) ?
3902 m_vars_extra[i].read_rng_src ?
3903 m_vars_extra[i].read_rng_src->arr_desc->dim[
3904 m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
3905 m_vars_extra[i].read_rng_dst->arr_desc->dim[
3906 m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
3907 arr_desc_src = (m_vars_extra[i].read_rng_src) ?
3908 m_vars_extra[i].read_rng_src->arr_desc :
3909 make_arr_desc(NULL, // don't required for source
3910 offset_src/el_size, size_src/el_size,
3911 el_size);
3912 arr_desc_dst = !m_vars[i].into ? arr_desc_src :
3913 (m_vars_extra[i].read_rng_dst) ?
3914 m_vars_extra[i].read_rng_dst->arr_desc :
3915 make_arr_desc(NULL,
3916 offset_dst/el_size, size_src/el_size, el_size);
3918 arr_desc_dst->base = reinterpret_cast<int64_t>(base);
3920 res = COI::BufferReadMultiD(
3921 m_vars_extra[i].src_data->mic_buf, // SourceBuffer
3922 m_vars[i].offset + m_vars[i].mic_offset -
3923 m_vars_extra[i].src_data->alloc_disp, // Offset
3924 (void*)arr_desc_dst, // descriptor of DestArray
3925 (void*)arr_desc_src, // descriptor of SrcArray
3926 COI_COPY_UNSPECIFIED, // Type
3927 in_deps_amount, // Number of in Dependencies
3928 in_deps, // array of in Dependencies
3929 event); // out Dependency
3930 if (res != COI_SUCCESS) {
3931 if (m_status != 0) {
3932 m_status->result = translate_coi_error(res);
3933 return false;
3935 report_coi_error(c_buf_copy, res);
// Single multi-dimensional COI transfer handled everything; done.
3937 return(true);
3939 // if event is defined we must multiplate for all contiguous intervals
3940 // that will be Copied/Read.
3941 // Take in account that we already have 1 event.
3942 if (event) {
3943 m_out_deps_allocated += (length_src / receive_size) *
3944 ((m_vars_extra[i].read_rng_src) ?
3945 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
3946 m_out_deps =
3947 (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated);
3948 m_out_deps_total--;
3951 // consequently get contiguous ranges,
3952 // define corresponded destination offset and receive data
3953 do {
3954 // get sorce offset
3955 if (src_is_empty) {
3956 if (m_vars_extra[i].read_rng_src) {
3957 if (!get_next_range(m_vars_extra[i].read_rng_src,
3958 &offset_src)) {
3959 // source ranges are over - nothing to send
3960 break;
3963 else if (received_data == 0) {
3964 offset_src = m_vars[i].disp;
3966 else {
3967 break;
3969 length_src_cur = length_src;
3971 else {
3972 // if source is contiguous or its contiguous range is greater
3973 // than destination one
3974 offset_src += receive_size;
3976 length_src_cur -= receive_size;
3977 src_is_empty = length_src_cur == 0;
3979 // get destination offset
3980 if (dst_is_empty) {
3981 if (m_vars[i].into) {
3982 if (m_vars_extra[i].read_rng_dst) {
3983 if (!get_next_range(m_vars_extra[i].read_rng_dst,
3984 &offset_dst)) {
3985 // destination ranges are over
3986 LIBOFFLOAD_ERROR(c_destination_is_over);
3987 return false;
3990 // destination is contiguous.
3991 else {
3992 offset_dst = m_vars_extra[i].cpu_disp;
3994 length_dst_cur = length_dst;
3996 // same as source
3997 else {
3998 offset_dst = offset_src;
3999 length_dst_cur = length_src;
4002 else {
4003 // if destination is contiguous or its contiguous range is greater
4004 // than source one
4005 offset_dst += receive_size;
4007 length_dst_cur -= receive_size;
4008 dst_is_empty = length_dst_cur == 0;
// Asynchronous mode: each chunk records its own completion event.
4009 if (event) {
4010 event = &m_out_deps[m_out_deps_total++];
// Buffer-to-buffer copy when a host COI buffer exists, otherwise a raw
// read from the target buffer into host memory at base + offset_dst.
4012 if (dst_buf != 0) {
4013 res = COI::BufferCopy(
4014 dst_buf,
4015 m_vars_extra[i].src_data->mic_buf,
4016 m_vars_extra[i].cpu_offset + offset_dst,
4017 m_vars[i].offset + offset_src +
4018 m_vars[i].mic_offset,
4019 receive_size,
4020 COI_COPY_UNSPECIFIED,
4021 in_deps_amount,
4022 in_deps,
4023 event);
4024 if (res != COI_SUCCESS) {
4025 if (m_status != 0) {
4026 m_status->result = translate_coi_error(res);
4027 return false;
4029 report_coi_error(c_buf_copy, res);
4032 else {
4033 res = COI::BufferRead(
4034 m_vars_extra[i].src_data->mic_buf,
4035 m_vars[i].offset + offset_src +
4036 m_vars[i].mic_offset,
4037 base + offset_dst,
4038 receive_size,
4039 COI_COPY_UNSPECIFIED,
4040 in_deps_amount,
4041 in_deps,
4042 event);
4043 if (res != COI_SUCCESS) {
4044 if (m_status != 0) {
4045 m_status->result = translate_coi_error(res);
4046 return false;
4048 report_coi_error(c_buf_read, res);
4051 received_data += receive_size;
4053 while (true);
4054 return true;
// Brings OUT pointer-variable data back from the target after compute:
// issues COI BufferCopy/BufferRead (or the noncontiguous helper) per
// variable, then drops references / schedules destruction of buffers for
// variables marked free_if. Transfers are made asynchronous when is_async
// is set or the parallel-copy heuristic below decides it pays off.
//   is_async   - caller requests asynchronous transfers (signal clause).
//   first_run  - first of up to two passes; preallocated OUT pointers are
//                deferred to the second pass (see m_out_with_preallocated).
//   info       - opaque payload for the OpenMP async-event callback.
// Returns false (translated COI error in m_status when set) on failure.
// NOTE(review): extraction artifact — original line numbers embedded and
// closing-brace lines dropped; code kept byte-identical, comments only.
4057 bool OffloadDescriptor::receive_pointer_data(bool is_async,
4058 bool first_run, void *info)
4060 OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
4062 bool should_use_async_buffer_read = m_initial_need_runfunction;
4063 uint64_t ptr_received = 0;
4064 COIRESULT res;
4066 // For offload_transfer and offload with empty body without signal:
4067 // - if there is only one buffer copy - get data synchronously
4068 // - if there are multiple buffer copy and
4069 // __offload_parallel_copy is false - get data synchronously
4070 // - if there are multiple buffer copy
4071 // and __offload_parallel_copy is true - get data asynchronously
4072 // It concerns only data with size greater than __offload_use_async_buffer_read.
4073 // Data of size less than __offload_use_async_buffer_read are received synchronously.
4074 // Synchronous transfer results in better performance in COI.
4075 // __offload_parallel_copy is false by default but can be changed
4076 // via environment variable OFFLOAD_PARALLEL_COPY
4077 if (!m_initial_need_runfunction && __offload_parallel_copy) {
4078 int big_size_count = 0;
4080 for (int i = 0; i < m_vars_total; i++) {
4081 if (m_vars[i].direction.out &&
4082 m_vars[i].size >= __offload_use_async_buffer_read) {
4083 // preallocated OUT only at second run
4084 if (first_run == m_vars[i].flags.preallocated) {
4085 continue;
4087 switch (m_vars[i].type.src) {
4088 case c_data:
4089 case c_void_ptr:
4090 case c_cean_var:
4091 if (m_vars[i].flags.is_static) {
4092 big_size_count++;
4094 break;
4095 case c_string_ptr:
4096 case c_data_ptr:
4097 case c_cean_var_ptr:
4098 case c_dv_data:
4099 case c_dv_ptr_data:
4100 case c_dv_data_slice:
4101 case c_dv_ptr_data_slice:
4102 case c_dv_ptr:
4103 big_size_count++;
4104 break;
4105 default:
4106 break;
// Async pays off only when more than one large transfer can overlap.
4110 if (big_size_count > 1) {
4111 should_use_async_buffer_read = true;
4114 uint32_t in_deps_amount = m_in_deps_total;
4115 COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
4117 if (0 == m_in_deps_total &&
4118 m_stream != no_stream &&
4119 m_vars_total != 0) {
4120 get_stream_in_dependencies(in_deps_amount, in_deps);
4123 for (int i = 0; i < m_vars_total; i++) {
4124 uint64_t received_data = m_vars[i].size;
4125 uint32_t in_deps_amount_save;
4126 COIEVENT *in_deps_save;
// For a variable carrying the last OpenMP read event, temporarily merge
// the accumulated out-dependencies into the in-dependency list so the
// event callback fires only after everything before it completed.
4128 if (m_vars_extra[i].omp_last_event_type == c_last_read) {
4129 in_deps_amount_save = in_deps_amount;
4130 in_deps_save = in_deps;
4132 in_deps_amount += m_out_deps_total;
4133 if (in_deps_amount > 0) {
4134 in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * in_deps_amount);
4135 if (in_deps == NULL)
4136 LIBOFFLOAD_ERROR(c_malloc);
4137 memcpy(in_deps, in_deps_save,
4138 in_deps_amount_save * sizeof(COIEVENT));
// NOTE(review): pointer arithmetic here already scales by
// sizeof(COIEVENT); "in_deps + in_deps_amount_save *
// sizeof(COIEVENT)" looks double-scaled — verify against the
// upstream repository before relying on this path.
4139 memcpy(in_deps + in_deps_amount_save * sizeof(COIEVENT),
4140 m_out_deps,
4141 m_out_deps_total * sizeof(COIEVENT));
4144 // At first run don't receive by preallocated target pointer as the
4145 //pointer value will be ready later after call to scatter_copyout_data
4146 if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
4147 m_preallocated_alloc = true;
4148 // need one more call to OffloadDescriptor::receive_pointer_data
4149 if (m_vars[i].direction.out) {
4150 m_out_with_preallocated = true;
4152 continue;
4154 switch (m_vars[i].type.src) {
4155 case c_data_ptr_array:
4156 break;
4157 case c_data:
4158 case c_void_ptr:
4159 case c_cean_var:
// Static host data: locate the host-side buffer (if any) and either
// copy buffer-to-buffer or read into raw host memory.
4160 if (m_vars[i].direction.out &&
4161 m_vars[i].flags.is_static) {
4162 COIEVENT *event =
4163 (is_async ||
4164 m_in_deps_total > 0 ||
4165 (should_use_async_buffer_read &&
4166 m_vars[i].size >= __offload_use_async_buffer_read)) ?
4167 &m_out_deps[m_out_deps_total++] : 0;
4168 PtrData *ptr_data = NULL;
4169 COIBUFFER dst_buf = NULL; // buffer at host
4170 char *base;
4172 if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
4173 ptr_data = m_vars[i].into ?
4174 m_vars_extra[i].dst_data :
4175 m_vars_extra[i].src_data;
4177 else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
4178 if (m_vars[i].flags.is_static_dstn) {
4179 ptr_data = m_vars[i].into ?
4180 m_vars_extra[i].dst_data :
4181 m_vars_extra[i].src_data;
4184 dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
4185 if (dst_buf == NULL) {
4186 base = offload_get_src_base(
4187 m_vars[i].into ?
4188 static_cast<char*>(m_vars[i].into) :
4189 static_cast<char*>(m_vars[i].ptr),
4190 m_vars[i].type.dst);
4193 if (m_vars[i].flags.is_noncont_src ||
4194 m_vars[i].flags.is_noncont_dst) {
4195 receive_noncontiguous_pointer_data(
4196 i, dst_buf, event, received_data,
4197 in_deps_amount, in_deps);
4199 else if (dst_buf != 0) {
4200 res = COI::BufferCopy(
4201 dst_buf,
4202 m_vars_extra[i].src_data->mic_buf,
4203 m_vars_extra[i].cpu_offset +
4204 m_vars_extra[i].cpu_disp,
4205 m_vars[i].offset + m_vars[i].disp,
4206 m_vars[i].size,
4207 COI_COPY_UNSPECIFIED,
4208 in_deps_amount,
4209 in_deps,
4210 event);
4211 if (res != COI_SUCCESS) {
4212 if (m_status != 0) {
4213 m_status->result = translate_coi_error(res);
4214 return false;
4216 report_coi_error(c_buf_copy, res);
4219 else {
4220 res = COI::BufferRead(
4221 m_vars_extra[i].src_data->mic_buf,
4222 m_vars[i].offset + m_vars[i].disp,
4223 base + m_vars_extra[i].cpu_offset +
4224 m_vars_extra[i].cpu_disp,
4225 m_vars[i].size,
4226 COI_COPY_UNSPECIFIED,
4227 in_deps_amount,
4228 in_deps,
4229 event);
4230 if (res != COI_SUCCESS) {
4231 if (m_status != 0) {
4232 m_status->result = translate_coi_error(res);
4233 return false;
4235 report_coi_error(c_buf_read, res);
4238 ptr_received += received_data;
4240 break;
4242 case c_string_ptr:
4243 case c_data_ptr:
4244 case c_cean_var_ptr:
4245 case c_dv_data:
4246 case c_dv_ptr_data:
4247 case c_dv_data_slice:
4248 case c_dv_ptr_data_slice:
4249 case c_dv_ptr: {
// Pointer-like variables: compute the host destination (COI buffer plus
// offset, or a raw base pointer) from the destination variable kind.
4250 COIBUFFER dst_buf = NULL; // buffer on host
4251 if (m_vars[i].direction.out && m_vars[i].size > 0) {
4252 COIEVENT *event =
4253 (is_async ||
4254 m_in_deps_total > 0 ||
4255 (should_use_async_buffer_read &&
4256 m_vars[i].size >= __offload_use_async_buffer_read)) ?
4257 &m_out_deps[m_out_deps_total++] : 0;
4259 uint64_t dst_offset = 0;
4260 char *base = static_cast<char*>(m_vars[i].ptr);
4262 if (VAR_TYPE_IS_PTR(m_vars[i].type.dst)) {
4263 PtrData *ptr_data = m_vars[i].into ?
4264 m_vars_extra[i].dst_data :
4265 m_vars_extra[i].src_data;
4266 dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
4267 if (dst_buf == NULL) {
4268 base = m_vars[i].into ?
4269 *static_cast<char**>(m_vars[i].into) :
4270 *static_cast<char**>(m_vars[i].ptr);
4272 dst_offset = m_vars_extra[i].cpu_offset +
4273 m_vars_extra[i].cpu_disp;
4275 else if (VAR_TYPE_IS_SCALAR(m_vars[i].type.dst)) {
4276 if (m_vars[i].flags.is_static_dstn) {
4277 dst_buf = m_vars[i].into ?
4278 m_vars_extra[i].dst_data->cpu_buf :
4279 m_vars_extra[i].src_data->cpu_buf;
4281 if (dst_buf == NULL) {
4282 base = offload_get_src_base(
4283 m_vars[i].into ?
4284 static_cast<char*>(m_vars[i].into) :
4285 static_cast<char*>(m_vars[i].ptr),
4286 m_vars[i].type.dst);
4288 dst_offset = m_vars_extra[i].cpu_offset +
4289 m_vars_extra[i].cpu_disp;
4291 else if (VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst) ||
4292 VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst)) {
4293 PtrData *ptr_data = m_vars[i].into != 0 ?
4294 m_vars_extra[i].dst_data :
4295 m_vars_extra[i].src_data;
4296 dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
4297 if (dst_buf == NULL) {
4298 base = offload_get_src_base(
4299 m_vars[i].into ?
4300 static_cast<char*>(m_vars[i].into) :
4301 static_cast<char*>(m_vars[i].ptr),
4302 m_vars[i].type.dst);
4305 dst_offset = m_vars_extra[i].cpu_offset +
4306 m_vars_extra[i].cpu_disp;
4309 if (m_vars[i].flags.is_noncont_src ||
4310 m_vars[i].flags.is_noncont_dst) {
4311 receive_noncontiguous_pointer_data(
4312 i, dst_buf, event, received_data,
4313 in_deps_amount,
4314 in_deps);
4316 else if (dst_buf != 0) {
4317 res = COI::BufferCopy(
4318 dst_buf,
4319 m_vars_extra[i].src_data->mic_buf,
4320 dst_offset,
4321 m_vars[i].offset + m_vars[i].disp +
4322 m_vars[i].mic_offset,
4323 m_vars[i].size,
4324 COI_COPY_UNSPECIFIED,
4325 in_deps_amount,
4326 in_deps,
4327 event);
4328 if (res != COI_SUCCESS) {
4329 if (m_status != 0) {
4330 m_status->result = translate_coi_error(res);
4331 return false;
4333 report_coi_error(c_buf_copy, res);
4336 else {
4337 res = COI::BufferRead(
4338 m_vars_extra[i].src_data->mic_buf,
4339 m_vars[i].offset + m_vars[i].disp +
4340 m_vars[i].mic_offset,
4341 base + dst_offset,
4342 m_vars[i].size,
4343 COI_COPY_UNSPECIFIED,
4344 in_deps_amount,
4345 in_deps,
4346 event);
4347 if (res != COI_SUCCESS) {
4348 if (m_status != 0) {
4349 m_status->result = translate_coi_error(res);
4350 return false;
4352 report_coi_error(c_buf_read, res);
4355 ptr_received += received_data;
4357 break;
4360 default:
4361 break;
// Restore the saved dependency list and fire the OpenMP completion
// callback bound to the last read for this variable.
4364 if (m_vars_extra[i].omp_last_event_type == c_last_read) {
4365 in_deps_amount = in_deps_amount_save;
4366 in_deps = in_deps_save;
4367 register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info);
4369 // destroy buffers for obsolete stacks
4370 if (m_destroy_stack.size() != 0) {
4371 for (PtrDataList::iterator it = m_destroy_stack.begin();
4372 it != m_destroy_stack.end(); it++) {
4373 PtrData *ptr_data = *it;
4374 m_destroy_buffers.push_back(ptr_data->mic_buf);
4375 OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
4376 ptr_data->mic_addr);
4378 m_destroy_stack.clear();
// free_if teardown: drop reference counts and queue buffers/associations
// for destruction once no other offload references them.
4380 if (m_vars[i].free_if) {
4381 // remove association for automatic variables
4382 if (m_is_openmp && !m_vars[i].flags.is_static &&
4383 (m_vars[i].type.src == c_data ||
4384 m_vars[i].type.src == c_void_ptr ||
4385 m_vars[i].type.src == c_cean_var)) {
4386 AutoData *auto_data = m_vars_extra[i].auto_data;
4387 if (auto_data != 0) {
4388 if (m_vars[i].flags.always_delete) {
4389 auto_data->nullify_reference();
4391 else if(auto_data->remove_reference() == 0) {
4392 m_device.remove_auto_data(auto_data->cpu_addr.start());
4397 // destroy buffers
4398 if (m_vars[i].direction.out || m_vars[i].into == NULL) {
4399 if (!VAR_TYPE_IS_PTR(m_vars[i].type.src) &&
4400 !VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.src) &&
4401 !VAR_TYPE_IS_DV_DATA(m_vars[i].type.src)) {
4402 continue;
4405 PtrData *ptr_data = m_vars_extra[i].src_data;
4406 if (ptr_data->remove_reference() == 0) {
4407 // destroy buffers
4408 if (ptr_data->cpu_buf != 0) {
4409 m_destroy_buffers.push_back(ptr_data->cpu_buf);
4411 if (ptr_data->mic_buf != 0) {
4412 m_destroy_buffers.push_back(ptr_data->mic_buf);
4414 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4415 ptr_data->cpu_addr.start());
4417 // remove association from map
4418 if (m_vars[i].flags.targetptr) {
4419 m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
4421 else {
4422 m_device.remove_ptr_data(ptr_data->cpu_addr.start());
4426 else if (VAR_TYPE_IS_PTR(m_vars[i].type.dst) ||
4427 VAR_TYPE_IS_DV_DATA_SLICE(m_vars[i].type.dst) ||
4428 VAR_TYPE_IS_DV_DATA(m_vars[i].type.dst)) {
4429 PtrData *ptr_data = m_vars_extra[i].dst_data;
4430 if (ptr_data->remove_reference() == 0) {
4431 // destroy buffers
4432 if (ptr_data->cpu_buf != 0) {
4433 m_destroy_buffers.push_back(ptr_data->cpu_buf);
4435 if (ptr_data->mic_buf != 0) {
4436 m_destroy_buffers.push_back(ptr_data->mic_buf);
4438 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4439 ptr_data->cpu_addr.start());
4441 // remove association from map
4442 if (m_vars[i].flags.targetptr) {
4443 m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
4445 else {
4446 m_device.remove_ptr_data(ptr_data->cpu_addr.start());
4453 if (m_status) {
4454 m_status->data_received += ptr_received;
4457 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
4458 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
4459 c_offload_received_pointer_data,
4460 "Total pointer data received from target: [%lld] bytes\n",
4461 ptr_received);
4463 return true;
// Unmarshals (scatters) copy-out data returned by the run-function from the
// in/out buffer back into host variables, and registers associations for
// pointers the target preallocated (preallocated && alloc_if).
// Returns false (translated COI error in m_status when set) on
// BufferMap/BufferUnmap or association failure; true otherwise.
// NOTE(review): extraction artifact — original line numbers embedded and
// closing-brace lines dropped; code kept byte-identical, comments only.
4466 bool OffloadDescriptor::scatter_copyout_data()
4468 OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
4470 if (m_need_runfunction && m_out_datalen > 0) {
4472 // total size that need to be transferred from target to host
4473 COIMAPINSTANCE map_inst;
4474 COIRESULT res;
4475 char *data;
4477 // output data buffer
4478 if (m_func_desc->data_offset == 0) {
4479 OffloadTimer timer_map(get_timer_data(),
4480 c_offload_host_map_out_data_buffer);
4482 COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
4483 COI_MAP_READ_ONLY, 0, 0, 0,
4484 &map_inst,
4485 reinterpret_cast<void**>(&data));
4486 if (res != COI_SUCCESS) {
4487 if (m_status != 0) {
4488 m_status->result = translate_coi_error(res);
4489 return false;
4491 report_coi_error(c_buf_map, res);
4494 else {
4495 data = (char*) m_func_desc + m_func_desc->data_offset;
4498 // get timing data
// Target-side timing records are prepended to the out payload.
4499 OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
4500 data += OFFLOAD_TIMER_DATALEN();
4502 // initialize output marshaller
4503 m_out.init_buffer(data, m_out_datalen);
4505 for (int i = 0; i < m_vars_total; i++) {
4506 bool src_is_for_mic = (m_vars[i].direction.out ||
4507 m_vars[i].into == NULL);
// A pointer the target preallocated: receive its target address now and
// create/record the host<->target association for it.
4509 if (m_vars[i].type.src != c_data_ptr_array &&
4510 m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
4511 PtrData *ptr_data;
4512 void *ptr_value;
4513 void ** cpu_ptr = src_is_for_mic ?
4514 reinterpret_cast<void**>(m_vars[i].ptr) :
4515 reinterpret_cast<void**>(m_vars[i].into);
4516 void* alloc_base = NULL;
4517 int64_t alloc_disp = 0;
4518 int64_t alloc_size;
4519 if (m_vars_extra[i].alloc != NULL) {
4520 // array descriptor
4521 const Arr_Desc *ap =
4522 static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);
4524 __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
4526 alloc_base = reinterpret_cast<void*>(ap->base);
4529 // get pointer to target memory
4530 m_out.receive_data(&ptr_value, sizeof(void*));
4532 // add new entry
4533 if (!alloc_ptr_data(
4534 ptr_data,
4535 ptr_value,
4536 (alloc_base != NULL) ?
4537 alloc_disp : m_vars[i].disp,
4538 (alloc_base != NULL) ?
4539 alloc_size : m_vars[i].size,
4540 alloc_disp,
4542 m_vars[i].flags.targetptr,
4543 m_vars[i].flags.preallocated,
4544 m_vars[i].flags.pin)) {
4545 return false;
4548 ptr_data->add_reference();
4549 *cpu_ptr = ptr_value;
4550 if (src_is_for_mic) {
4551 m_vars_extra[i].src_data = ptr_data;
4553 else {
4554 m_vars_extra[i].dst_data = ptr_data;
4556 m_vars[i].offset = (char*) ptr_value -
4557 (char*) ptr_data->cpu_addr.start();
4560 switch (m_vars[i].type.src) {
4561 case c_data_ptr_array:
4562 break;
4563 case c_data:
4564 case c_void_ptr:
4565 case c_cean_var:
4566 if (m_vars[i].direction.out &&
4567 !m_vars[i].flags.is_static) {
4569 if (m_vars[i].into) {
4570 char *ptr = offload_get_src_base(
4571 static_cast<char*>(m_vars[i].into),
4572 m_vars[i].type.dst);
4573 m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
4574 m_vars[i].size);
4576 else {
4577 m_out.receive_data(
4578 static_cast<char*>(m_vars[i].ptr) +
4579 m_vars_extra[i].cpu_disp,
4580 m_vars[i].size);
4583 break;
4585 case c_func_ptr:
// Function pointers come back as target addresses and are translated
// to host addresses by the marshaller.
4586 if (m_vars[i].direction.out) {
4587 m_out.receive_func_ptr((const void**) m_vars[i].ptr);
4589 break;
4591 default:
4592 break;
4596 if (m_status) {
4597 m_status->data_received += m_out.get_tfr_size();
// data_offset == 0 means the mapped-buffer path was taken; unmap it.
4600 if (m_func_desc->data_offset == 0) {
4601 OffloadTimer timer_unmap(get_timer_data(),
4602 c_offload_host_unmap_out_data_buffer);
4604 COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
4605 if (res != COI_SUCCESS) {
4606 if (m_status != 0) {
4607 m_status->result = translate_coi_error(res);
4608 return false;
4610 report_coi_error(c_buf_unmap, res);
4615 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
4616 OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
4617 m_out.get_tfr_size());
4619 return true;
// Derives transfer parameters from an array descriptor:
//   offset    - byte offset of the first transferred element,
//   size      - byte size of one contiguous run,
//   el_number - total number of elements of el_size to transfer,
//   ptr_ranges- NULL for a contiguous descriptor; otherwise an iterator
//               over the descriptor's contiguous ranges.
// NOTE(review): extraction artifact — original line numbers embedded and
// closing braces dropped; in the noncontiguous branch `offset` appears to
// be left unset here — callers presumably take offsets from ptr_ranges;
// verify against the upstream file.
4622 static void get_arr_desc_numbers(
4623 const Arr_Desc *ap,
4624 int64_t el_size,
4625 int64_t &offset,
4626 int64_t &size,
4627 int &el_number,
4628 CeanReadRanges* &ptr_ranges
4631 if (is_arr_desc_contiguous(ap)) {
4632 ptr_ranges = NULL;
4633 __arr_data_offset_and_length(ap, offset, size);
4634 el_number = size / el_size;
4636 else {
4637 ptr_ranges = init_read_ranges_arr_desc(ap);
4638 el_number = (ptr_ranges->range_size / el_size) *
4639 ptr_ranges->range_max_number;
4640 size = ptr_ranges->range_size;
4644 bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
4646 int pointers_number;
4647 int tmp_val;
4648 int new_index = m_vars_total;
4649 const Arr_Desc *ap;
4650 const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
4651 int flags = vd3->array_fields;
4652 bool src_is_for_mic = (m_vars[i].direction.out ||
4653 m_vars[i].into == NULL);
4655 ReadArrElements<void *> ptr;
4656 ReadArrElements<void *> into;
4657 ReadArrElements<int64_t> ext_start;
4658 ReadArrElements<int64_t> ext_elements;
4659 ReadArrElements<int64_t> align;
4660 ReadArrElements<int64_t> alloc_if;
4661 ReadArrElements<int64_t> free_if;
4662 ReadArrElements<int64_t> into_start;
4663 ReadArrElements<int64_t> into_elem;
4664 ReadArrElements<int64_t> alloc_start;
4665 ReadArrElements<int64_t> alloc_elem;
4668 ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
4670 // "pointers_number" for total number of transferred pointers.
4671 // For each of them we create new var_desc and put it at the bottom
4672 // of the var_desc's array
4673 get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
4674 pointers_number, ptr.ranges);
4675 ptr.base = (m_vars[i].flags.is_pointer) ?
4676 *(reinterpret_cast<char**>(ap->base)) :
4677 reinterpret_cast<char*>(ap->base);
4679 // 2. prepare memory for new var_descs
4680 m_vars_total += pointers_number;
4681 m_vars = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
4682 if (m_vars == NULL)
4683 LIBOFFLOAD_ERROR(c_malloc);
4684 m_vars_extra =
4685 (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
4686 if (m_vars_extra == NULL)
4687 LIBOFFLOAD_ERROR(c_malloc);
4688 m_in_deps =
4689 (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
4690 if (m_in_deps == NULL)
4691 LIBOFFLOAD_ERROR(c_malloc);
4692 m_out_deps =
4693 (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
4694 if (m_out_deps == NULL)
4695 LIBOFFLOAD_ERROR(c_malloc);
4697 // 3. Prepare for reading new var_desc's fields
4698 // EXTENT START
4699 if ((flags & (1<<flag_extent_start_is_array)) != 0) {
4700 ap = static_cast<const Arr_Desc*>(vd3->extent_start);
4701 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
4702 ext_start.size, tmp_val, ext_start.ranges);
4703 ext_start.base = reinterpret_cast<char*>(ap->base);
4704 ext_start.el_size = ap->dim[ap->rank - 1].size;
4706 if (tmp_val < pointers_number) {
4707 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
4708 return false;
4711 else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
4712 ext_start.val = (int64_t)vd3->extent_start;
4714 else {
4715 ext_start.val = 0;
4718 // EXTENT ELEMENTS NUMBER
4719 if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
4720 ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
4721 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
4722 ext_elements.offset, ext_elements.size,
4723 tmp_val, ext_elements.ranges);
4724 ext_elements.base = reinterpret_cast<char*>(ap->base);
4725 ext_elements.el_size = ap->dim[ap->rank - 1].size;
4727 if (tmp_val < pointers_number) {
4728 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
4729 return false;
4732 else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
4733 ext_elements.val = (int64_t)vd3->extent_elements;
4735 else {
4736 ext_elements.val = m_vars[i].count;
4739 // ALLOC_IF
4740 if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
4741 ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
4742 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
4743 alloc_if.size, tmp_val, alloc_if.ranges);
4744 alloc_if.base = reinterpret_cast<char*>(ap->base);
4745 alloc_if.el_size = ap->dim[ap->rank - 1].size;
4747 if (tmp_val < pointers_number) {
4748 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
4749 return false;
4752 else {
4753 alloc_if.val = m_vars[i].alloc_if;
4756 // FREE_IF
4757 if ((flags & (1<<flag_free_if_is_array)) != 0) {
4758 ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
4759 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
4760 free_if.size, tmp_val, free_if.ranges);
4761 free_if.base = reinterpret_cast<char*>(ap->base);
4762 free_if.el_size = ap->dim[ap->rank - 1].size;
4764 if (tmp_val < pointers_number) {
4765 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
4766 return false;
4769 else {
4770 free_if.val = m_vars[i].free_if;
4773 // ALIGN
4775 if ((flags & (1<<flag_align_is_array)) != 0) {
4776 ap = static_cast<const Arr_Desc*>(vd3->align_array);
4777 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
4778 align.size, tmp_val, align.ranges);
4779 align.base = reinterpret_cast<char*>(ap->base);
4780 align.el_size = ap->dim[ap->rank - 1].size;
4782 if (tmp_val < pointers_number) {
4783 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
4784 return false;
4787 else {
4788 align.val = m_vars[i].align;
4791 // 3.1 INTO
4793 if (m_vars[i].into) {
4794 ap = static_cast<const Arr_Desc*>(m_vars[i].into);
4795 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
4796 into.size, tmp_val, into.ranges);
4797 into.base = reinterpret_cast<char*>(ap->base);
4799 if (tmp_val < pointers_number) {
4800 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
4801 return false;
4805 // 3.2 INTO_START
4807 if ((flags & (1<<flag_into_start_is_array)) != 0) {
4808 ap = static_cast<const Arr_Desc*>(vd3->into_start);
4809 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
4810 into_start.size, tmp_val, into_start.ranges);
4811 into_start.base = reinterpret_cast<char*>(ap->base);
4812 into_start.el_size = ap->dim[ap->rank - 1].size;
4814 if (tmp_val < pointers_number) {
4815 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
4816 return false;
4819 else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
4820 into_start.val = (int64_t)vd3->into_start;
4822 else {
4823 into_start.val = 0;
4826 // 3.3 INTO_ELEMENTS
4828 if ((flags & (1<<flag_into_elements_is_array)) != 0) {
4829 ap = static_cast<const Arr_Desc*>(vd3->into_elements);
4830 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
4831 into_elem.size, tmp_val, into_elem.ranges);
4832 into_elem.base = reinterpret_cast<char*>(ap->base);
4833 into_elem.el_size = ap->dim[ap->rank - 1].size;
4835 if (tmp_val < pointers_number) {
4836 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
4837 return false;
4840 else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
4841 into_elem.val = (int64_t)vd3->into_elements;
4843 else {
4844 into_elem.val = m_vars[i].count;
4847 // alloc_start
4849 if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
4850 ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
4851 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
4852 alloc_start.offset, alloc_start.size, tmp_val,
4853 alloc_start.ranges);
4854 alloc_start.base = reinterpret_cast<char*>(ap->base);
4855 alloc_start.el_size = ap->dim[ap->rank - 1].size;
4857 if (tmp_val < pointers_number) {
4858 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
4859 return false;
4862 else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
4863 alloc_start.val = (int64_t)vd3->alloc_start;
4865 else {
4866 alloc_start.val = 0;
4869 // alloc_elem
4871 if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
4872 ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
4873 get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
4874 alloc_elem.size, tmp_val, alloc_elem.ranges);
4875 alloc_elem.base = reinterpret_cast<char*>(ap->base);
4876 alloc_elem.el_size = ap->dim[ap->rank - 1].size;
4877 if (tmp_val < pointers_number) {
4878 LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
4879 "alloc_extent elements");
4880 return false;
4883 else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
4884 alloc_elem.val = (int64_t)vd3->alloc_elements;
4886 else {
4887 alloc_elem.val = 0;
4890 for (int k = 0; k < pointers_number; k++) {
4891 int type = flags & 0x3f;
4892 int type_src, type_dst;
4893 // Get new values
4894 // type_src, type_dst
4895 type_src = type_dst = (type == c_data_ptr_array) ?
4896 c_data_ptr : (type == c_func_ptr_array) ?
4897 c_func_ptr : (type == c_void_ptr_array) ?
4898 c_void_ptr : (type == c_string_ptr_array) ?
4899 c_string_ptr : 0;
4901 // Get ptr val
4902 if (!ptr.read_next(true)) {
4903 break;
4905 else {
4906 ptr.val = (void*)(ptr.base + ptr.offset);
4909 // !!! If we got error at phase of reading - it's an internal
4910 // !!! error, as we must detect mismatch before
4912 // Get into val
4913 if (m_vars[i].into) {
4914 if (!into.read_next(true)) {
4915 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
4916 LIBOFFLOAD_ABORT;
4918 else {
4919 into.val = (void*)(into.base + into.offset);
4923 // Get other components of the clause
4924 if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
4925 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
4926 LIBOFFLOAD_ABORT;
4928 if (!ext_elements.read_next(
4929 flags & (1<<flag_extent_elements_is_array))) {
4930 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
4931 LIBOFFLOAD_ABORT;
4933 if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
4934 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
4935 LIBOFFLOAD_ABORT;
4937 if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
4938 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
4939 LIBOFFLOAD_ABORT;
4941 if (!align.read_next(flags & (1<<flag_align_is_array))) {
4942 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
4943 LIBOFFLOAD_ABORT;
4945 if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
4946 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
4947 LIBOFFLOAD_ABORT;
4949 if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
4950 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
4951 LIBOFFLOAD_ABORT;
4953 if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
4954 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
4955 LIBOFFLOAD_ABORT;
4957 if (!alloc_elem.read_next(
4958 flags & (1<<flag_alloc_elements_is_array))) {
4959 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
4960 LIBOFFLOAD_ABORT;
4963 m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
4964 m_vars[new_index + k].alloc_if = alloc_if.val;
4965 m_vars[new_index + k].free_if = free_if.val;
4966 m_vars[new_index + k].align = align.val;
4967 m_vars[new_index + k].mic_offset = 0;
4968 m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
4969 m_vars[new_index + k].offset = 0;
4970 m_vars[new_index + k].size = m_vars[i].size;
4971 m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
4972 m_vars[new_index + k].flags.preallocated =
4973 m_vars[i].flags.preallocated;
4975 if (ext_start.val == 0) {
4976 m_vars[new_index + k].count = ext_elements.val;
4977 m_vars[new_index + k].ptr = ptr.val;
4978 if (type_src == c_string_ptr) {
4979 m_vars[new_index + k].size = 0;
4982 else {
4983 m_vars[new_index + k].count = 0;
4984 m_vars[new_index + k].ptr =
4985 static_cast<void*>(make_arr_desc(
4986 ptr.val,
4987 ext_start.val,
4988 ext_elements.val,
4989 m_vars[i].size));
4991 type_src = type_src == c_data_ptr ? c_cean_var_ptr :
4992 c_string_ptr ? c_cean_var_ptr :
4993 type_src;
4994 if (!m_vars[i].into) {
4995 type_dst = type_src;
4999 if (m_vars[i].into && into_elem.val != 0) {
5000 m_vars[new_index + k].into =
5001 static_cast<void*>(make_arr_desc(
5002 into.val,
5003 into_start.val,
5004 into_elem.val,
5005 m_vars[i].size));
5006 type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
5007 (type == c_string_ptr_array) ? c_cean_var_ptr :
5008 type_src;
5010 else {
5011 m_vars[new_index + k].into = NULL;
5014 if (alloc_elem.val != 0) {
5015 m_vars[new_index + k].alloc =
5016 static_cast<void*>(make_arr_desc(
5017 ptr.val,
5018 alloc_start.val,
5019 alloc_elem.val,
5020 m_vars[i].size));
5022 else {
5023 m_vars[new_index + k].alloc = NULL;
5026 m_vars[new_index + k].type.src = type_src;
5027 m_vars[new_index + k].type.dst = type_dst;
5029 m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
5030 m_vars_extra[new_index + k].is_arr_ptr_el = 1;
5031 m_vars_extra[new_index + k].ptr_arr_offset =
5032 src_is_for_mic ? ptr.offset : into.offset;
5034 // count and alloc fields are useless at target. They can be reused
5035 // for pointer arrays.
5036 m_vars[i].count = pointers_number;
5037 m_vars[i].ptr_arr_offset = new_index;
5038 return true;
5041 // Gets in dependencies of the previous offload via the stream "m_stream".
5042 // Out argument in_deps_amount - address of amount of the dependencies
5043 // Out argument in_deps - array of dependencies.
5044 // Description of the dependencies scheme for streams :
5045 // ----------------------------------------------------
5046 // Every offload forms DAG consisted of 3 nodes:
5047 // for in-transfers, runfunction and out-transfers.
5048 // Every node has in-dependencies and out-dependencies
5049 // Out-dependencies of previous node forms in-dependencies of current node.
5050 // In-dependencies of 1-st node (of in-transfers) without streams is equal
5051 // to NULL. For streams in-dependencies of 1-st node is equal to list of out
5052 // dependencies of last node of previous offload via this stream.
5053 // So we can say that DAGs of 2 consequent offloads via the same stream are
5054 // connected by the way described above.
5055 void OffloadDescriptor::get_stream_in_dependencies(
5056 uint32_t &in_deps_amount,
5057 COIEVENT* &in_deps
5060 if (m_stream != no_stream && m_stream != 0) {
5061 Stream * stream = Stream::find_stream(m_stream, false);
5062 if (!stream) {
5063 LIBOFFLOAD_ERROR(c_offload_no_stream,
5064 m_device.get_logical_index());
5065 LIBOFFLOAD_ABORT;
5067 OffloadDescriptor* offload = stream->get_last_offload();
5069 // if it's the first offload in the stream
5070 if (!offload) {
5071 return;
5073 // if last offload has out-tranfers
5074 if (offload->m_out_deps_total) {
5075 in_deps_amount = offload->m_out_deps_total;
5076 in_deps = offload->m_out_deps;
5078 // last offload only sends pointer data or run function or both of them
5079 // and has no out-transfers
5080 else if (offload->m_in_deps_total) {
5081 in_deps_amount = offload->m_in_deps_total;
5082 in_deps = offload->m_in_deps;
5087 static void __offload_fini_library(void)
5089 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
5090 if (mic_engines_total > 0) {
5091 delete[] mic_engines;
5092 mic_engines_total = 0;
5094 if (mic_proxy_fs_root != 0) {
5095 free(mic_proxy_fs_root);
5096 mic_proxy_fs_root = 0;
5099 if (mic_library_path != 0) {
5100 free(mic_library_path);
5101 mic_library_path = 0;
5104 // destroy thread key
5105 thread_key_delete(mic_thread_key);
5108 // unload COI library
5109 if (COI::is_available) {
5110 COI::fini();
5113 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
// One-time initialization of the offload runtime, run via
// __offload_run_once from __offload_init_library.
// Reads the environment (tracing, reporting, device list, buffer sizes,
// stack size, proxy I/O, env-var forwarding), initializes COI, enumerates
// MIC engines into the mic_engines array, creates the per-thread data key
// and finally initializes ORSL. Returns early (leaving the runtime
// unavailable) if COI cannot be loaded or no devices are usable.
static void __offload_init_library_once(void)
{
    COIRESULT res;
    uint32_t num_devices;
    std::bitset<MIC_ENGINES_MAX> devices;
    prefix = report_get_message_str(c_report_host);

    // initialize trace
    const char *env_var = getenv(htrace_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            // only the low 4 bits select the console trace level
            console_enabled = new_val & 0x0f;
        }
    }

    env_var = getenv(offload_report_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t env_val;
        if (__offload_parse_int_string(env_var, env_val)) {
            if (env_val == OFFLOAD_REPORT_1 ||
                env_val == OFFLOAD_REPORT_2 ||
                env_val == OFFLOAD_REPORT_3) {
                offload_report_level = env_val;
            }
            else {
                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
                                 offload_report_envname);
            }
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_report_envname);
        }
    }
    else if (!offload_report_level) {
        // the timer variable is consulted only when reporting is off
        env_var = getenv(timer_envname);
        if (env_var != 0 && *env_var != '\0') {
            timer_enabled = atoi(env_var);
        }
    }

    // initialize COI
    if (!COI::init()) {
        return;
    }

    // get number of devices installed in the system
    res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
    if (res != COI_SUCCESS) {
        return;
    }

    if (num_devices > MIC_ENGINES_MAX) {
        num_devices = MIC_ENGINES_MAX;
    }

    // fill in the list of devices that can be used for offloading
    env_var = getenv("OFFLOAD_DEVICES");
    if (env_var != 0) {
        if (strcasecmp(env_var, "none") != 0) {
            // value is composed of comma separated physical device indexes
            char *buf = strdup(env_var);
            if (buf == NULL)
              LIBOFFLOAD_ERROR(c_malloc);
            char *str, *ptr;
            for (str = strtok_r(buf, ",", &ptr); str != 0;
                 str = strtok_r(0, ",", &ptr)) {
                // convert string to an int
                int64_t num;
                if (!__offload_parse_int_string(str, num)) {
                    LIBOFFLOAD_ERROR(c_mic_init5);

                    // fallback to using all installed devices
                    devices.reset();
                    for (int i = 0; i < num_devices; i++) {
                        devices.set(i);
                    }
                    break;
                }
                if (num < 0 || num >= num_devices) {
                    LIBOFFLOAD_ERROR(c_mic_init6, num);
                    continue;
                }
                devices.set(num);
            }
            free(buf);
        }
        // NOTE(review): "OFFLOAD_DEVICES=none" leaves the device set empty,
        // which disables offloading via the mic_engines_total check below.
    }
    else {
        // use all available devices
        for (int i = 0; i < num_devices; i++) {
            COIENGINE engine;
            res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
            if (res == COI_SUCCESS) {
                devices.set(i);
            }
        }
    }

    mic_engines_total = devices.count();

    // no need to continue if there are no devices to offload to
    if (mic_engines_total <= 0) {
        return;
    }

    // initialize indexes for available devices
    mic_engines = new Engine[mic_engines_total];
    for (int p_idx = 0, l_idx = 0; p_idx < num_devices; p_idx++) {
        if (devices[p_idx]) {
            mic_engines[l_idx].set_indexes(l_idx, p_idx);
            l_idx++;
        }
    }

    // Get DMA channel count to pass it to COI
    env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
    if (env_var != 0) {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_dma_channel_count = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             "OFFLOAD_DMA_CHANNEL_COUNT");
        }
    }

    // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
    // Use putenv instead of setenv as Windows has no setenv.
    // Note: putenv requires its argument can't be freed or modified.
    // So no free after call to putenv or elsewhere.
    env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
    if (env_var != 0) {
        char * new_env_var =
                   (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
                                  strlen(env_var));
        if (new_env_var == NULL)
          LIBOFFLOAD_ERROR(c_malloc);
        sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
        putenv(new_env_var);
    }

    // library search path for device binaries
    env_var = getenv("MIC_LD_LIBRARY_PATH");
    if (env_var != 0) {
        mic_library_path = strdup(env_var);
        if (mic_library_path == NULL)
          LIBOFFLOAD_ERROR(c_malloc);
    }

    // find target executable to be used if main application is not an
    // offload build application.
    const char *base_name = "offload_main";
    if (mic_library_path != 0) {
        char *buf = strdup(mic_library_path);
        if (buf == NULL)
          LIBOFFLOAD_ERROR(c_malloc);
        // +2 covers the '/' separator and the terminating NUL
        char *try_name = (char*) alloca(strlen(mic_library_path) +
                strlen(base_name) + 2);
        char *dir, *ptr;

        for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
             dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
            // compose a full path
            sprintf(try_name, "%s/%s", dir, base_name);

            // check if such file exists
            struct stat st;
            if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
                mic_device_main = strdup(try_name);
                if (mic_device_main == NULL)
                  LIBOFFLOAD_ERROR(c_malloc);
                break;
            }
        }

        free(buf);
    }

    // memory size reserved for COI buffers
    env_var = getenv("MIC_BUFFERSIZE");
    if (env_var != 0) {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_buffer_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
        }
    }

    // memory size reserved for 4K pages for COI buffers
    env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
    if (env_var != 0) {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_4k_buffer_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE");
        }
    }

    // memory size reserved for 2M pages for COI buffers
    env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
    if (env_var != 0) {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_2m_buffer_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_2M_BUFFER_RESERVE_SIZE");
        }
    }

    // determine stacksize for the pipeline on the device
    env_var = getenv("MIC_STACKSIZE");
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        // stack size must be at least 16K and a multiple of 4K
        if (__offload_parse_size_string(env_var, new_size) &&
            (new_size >= 16384) && ((new_size & 4095) == 0)) {
            mic_stack_size = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_mic_init3);
        }
    }

    // proxy I/O
    env_var = getenv("MIC_PROXY_IO");
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_proxy_io = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
        }
    }

    env_var = getenv("MIC_PROXY_FS_ROOT");
    if (env_var != 0 && *env_var != '\0') {
        mic_proxy_fs_root = strdup(env_var);
        if (mic_proxy_fs_root == NULL)
          LIBOFFLOAD_ERROR(c_malloc);
    }

    // Prepare environment for the target process using the following
    // rules
    // - If MIC_ENV_PREFIX is set then any environment variable on the
    //   host which has that prefix are copied to the device without
    //   the prefix.
    //   All other host environment variables are ignored.
    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
    //   environment is duplicated.
    env_var = getenv("MIC_ENV_PREFIX");
    if (env_var != 0 && *env_var != '\0') {
        mic_env_vars.set_prefix(env_var);

        int len = strlen(env_var);
        for (int i = 0; environ[i] != 0; i++) {
            if (strncmp(environ[i], env_var, len) == 0 &&
                strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
                environ[i][len] != '=') {
                mic_env_vars.analyze_env_var(environ[i]);
            }
        }
    }

    // create key for thread data
    if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
        LIBOFFLOAD_ERROR(c_mic_init4, errno);
        return;
    }

    // cpu frequency
    cpu_frequency = COI::PerfGetCycleFrequency();

    env_var = getenv(mic_use_2mb_buffers_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_2mb_buffers = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             mic_use_2mb_buffers_envname);
        }
    }

    env_var = getenv(mic_use_async_buffer_write_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_write = new_size;
        }
    }

    env_var = getenv(mic_use_async_buffer_read_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_async_buffer_read = new_size;
        }
    }

    // mic initialization type
    env_var = getenv(offload_init_envname);
    if (env_var != 0 && *env_var != '\0') {
        if (strcmp(env_var, "on_offload") == 0) {
            __offload_init_type = c_init_on_offload;
        }
        else if (strcmp(env_var, "on_offload_all") == 0) {
            __offload_init_type = c_init_on_offload_all;
        }
        else if (strcmp(env_var, "on_start") == 0) {
            __offload_init_type = c_init_on_start;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
        }
    }

    // active wait
    env_var = getenv(offload_active_wait_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            __offload_active_wait = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_active_wait_envname);
        }
    }

    // omp device num
    env_var = getenv(omp_device_num_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
            __omp_device_num = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
                             omp_device_num_envname);
        }
    }

    // parallel copy of offload_transfer
    env_var = getenv(parallel_copy_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
            __offload_parallel_copy = new_val;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             parallel_copy_envname);
        }
    }

    // use COI interface for noncontiguous arrays transfer
    env_var = getenv(use_coi_noncontiguous_transfer_envname);
    if (env_var != 0 && *env_var != '\0') {
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_coi_noncontiguous_transfer = new_size;
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             use_coi_noncontiguous_transfer_envname);
        }
    }

    // init ORSL
    ORSL::init();
}
// Thread-safe entry point for runtime initialization.
// Runs __offload_init_library_once exactly once, then registers any
// libraries queued by __offload_register_image before the runtime was
// ready. Returns nonzero when offloading is possible (COI loaded and at
// least one device found).
extern int __offload_init_library(void)
{
    // do one time initialization
    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
    __offload_run_once(&ctrl, __offload_init_library_once);

    // offload is available if COI is available and the number of devices > 0
    bool is_available = COI::is_available && (mic_engines_total > 0);

    // register pending libraries if there are any
    if (is_available && __target_libs) {
        // RAII lock: held until the end of this block
        mutex_locker_t locker(__target_libs_lock);

        for (TargetImageList::iterator it = __target_libs_list.begin();
             it != __target_libs_list.end(); it++) {
            // Register library in COI
            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
                                          &it->origin, &it->offset);

            // add lib to all engines
            for (int i = 0; i < mic_engines_total; i++) {
                mic_engines[i].add_lib(*it);
            }
        }

        __target_libs = false;
        __target_libs_list.clear();
    }

    return is_available;
}
5529 extern "C" bool __offload_target_image_is_executable(const void *target_image)
5531 const struct Image *image = static_cast<const struct Image*>(target_image);
5533 // decode image
5534 const char *name = image->data;
5535 const void *data = image->data + strlen(image->data) + 1;
5537 // determine image type
5538 const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
5539 return (hdr->e_type == ET_EXEC);
// Register a packed target image (executable or shared library) with the
// runtime. Executables trigger full library initialization; shared
// libraries are queued (and registered immediately if main has already
// started). Exits the process on duplicate executables or unknown
// binary types.
extern "C" bool __offload_register_image(const void *target_image)
{
    const struct Image *image = static_cast<const struct Image*>(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;
    uint64_t size = image->size;
    char *origin = (char *) malloc(strlen(image->data) + 1);
    uint64_t offset = 0;
    const char *host_name = image->data;
    int i;

    if (origin == NULL)
      LIBOFFLOAD_ERROR(c_malloc);

    // The origin name is the name of the file on the host
    // this is used by Vtune, since it is a fat binary we
    // use the host file name of the fat binary.
    // Driver prepends the host file name ending with "?"
    // to the image->data name so need to extract the string
    i = 0;
    while (*host_name != '\0' && *host_name != '?') {
        origin[i] = *host_name;
        host_name++;
        i++;
    }
    origin[i] = '\0';
    // Implies the host name does not exist which really should
    // not occur. Allow this since only consumer is Vtune.
    if ((i == 0) || (*host_name != '?')) {
        free(origin);
        origin = 0;
    }

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    switch (hdr->e_type) {
        case ET_EXEC:
            // Each offload application is supposed to have only one target
            // image representing target executable.
            // No thread synchronization is required here as the initialization
            // code is always executed in a single thread.
            if (__target_exe != 0) {
                LIBOFFLOAD_ERROR(c_multiple_target_exes);
                exit(1);
            }
            // NOTE(review): TargetImage presumably takes ownership of
            // "origin" here — confirm against its constructor.
            __target_exe = new TargetImage(name, data, size, origin, offset);

            // Registration code for execs is always called from the context
            // of main and thus we can safely call any function here,
            // including LoadLibrary API on windows. This is the place where
            // we do the offload library initialization.
            if (__offload_init_library()) {
                // initialize engine if init_type is on_start
                if (__offload_init_type == c_init_on_start) {
                    for (int i = 0; i < mic_engines_total; i++) {
                        mic_engines[i].init();
                    }
                }
            }
            return mic_engines_total > 0;

        case ET_DYN:
        {
            char *fullname = origin;
            // We add the library to a list of pending libraries
            __target_libs_lock.lock();
            __target_libs = true;
            __target_libs_list.push_back(
                TargetImage(name, data, size, fullname, offset));
            __target_libs_lock.unlock();
            // If __target_exe is set, then main has started running
            // If not main, then we can't do anything useful here
            // because this registration code is called from DllMain
            // context (on windows).
            if (__target_exe != 0) {
                // There is no need to delay loading the library
                if (!__offload_init_library()) {
                    // Couldn't validate library as a fat offload library
                    LIBOFFLOAD_ERROR(c_unknown_binary_type);
                    exit(1);
                }
            }
            return true;
        }

        default:
            // something is definitely wrong, issue an error and exit
            LIBOFFLOAD_ERROR(c_unknown_binary_type);
            exit(1);
    }
}
// Unregister a packed target image. For the executable this runs the full
// runtime teardown (timers, MYO, library cleanup); for a shared library it
// unloads the library from every engine.
extern "C" void __offload_unregister_image(const void *target_image)
{
    // Target image is packed as follows:
    //      8 bytes                - size of the target binary
    //      null-terminated string - binary name
    //      <size> bytes           - binary contents
    const struct Image {
         int64_t size;
         char data[];
    } *image = static_cast<const struct Image*>(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    if (hdr->e_type == ET_EXEC) {
        // We are executing exec's destructors.
        // It is time to do a library cleanup.
        if (timer_enabled) {
            Offload_Timer_Print();
        }

#ifdef MYO_SUPPORT
        __offload_myoFini();
#endif // MYO_SUPPORT

        __offload_fini_library();
    }
    else if (hdr->e_type == ET_DYN) {
        for (int i = 0; i < mic_engines_total; i++) {
            mic_engines[i].unload_library(data, name);
        }
    }
}
// Install the callback invoked on async offload task completion.
// Stores the pointer in the global task_completion_callback.
extern "C" void __offload_register_task_callback(void (*cb)(void *))
{
    task_completion_callback = cb;
}
5679 // Runtime trace interface for user programs
// Set the console trace verbosity level at runtime.
void __offload_console_trace(int level)
{
    console_enabled = level;
}
5686 // User-visible offload API
// Returns the number of usable MIC engines, initializing the runtime
// on first use.
int _Offload_number_of_devices(void)
{
    __offload_init_library();
    return mic_engines_total;
}
// Host-side stub: on the host there is no current device, so -1 is
// returned.
int _Offload_get_device_number(void)
{
    return -1;
}
// Host-side stub: on the host there is no current physical device, so -1
// is returned.
int _Offload_get_physical_device_number(void)
{
    return -1;
}
// Query whether the async offload identified by "signal" on device
// "index" has completed. Aborts on a negative index or unknown signal.
int _Offload_signaled(int index, void *signal)
{
    __offload_init_library();

    // check index value
    if (index < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
        LIBOFFLOAD_ABORT;
    }

    // NOTE(review): modulo assumes mic_engines_total > 0 here — presumably
    // guaranteed by earlier initialization; confirm.
    index %= mic_engines_total;

    // find associated async task
    OffloadDescriptor *task =
        mic_engines[index].find_signal(signal, false);
    if (task == 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
        LIBOFFLOAD_ABORT;
    }
    // if signal is removed by wait completing, the task is done
    else if (task == SIGNAL_IS_REMOVED) {
        return (true);
    }
    return task->is_signaled();
}
5730 void _Offload_report(int val)
5732 if (val == OFFLOAD_REPORT_ON ||
5733 val == OFFLOAD_REPORT_OFF) {
5734 offload_report_enabled = val;
// Look up the device-side buffer associated with a host address on the
// given target and report its geometry through the output pointers.
// Returns 0 when no association exists (or the sink address cannot be
// obtained); otherwise 1 for static data or the association's reference
// count for dynamic data.
int _Offload_find_associated_mic_memory(
    int target,
    const void* cpu_addr,
    void** cpu_base_addr,
    uint64_t* buf_length,
    void** mic_addr,
    uint64_t* mic_buf_start_offset,
    int* is_static
)
{
    __offload_init_library();

    // check target value
    if (target < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, target);
        LIBOFFLOAD_ABORT;
    }
    target %= mic_engines_total;

    // find existing association in pointer table
    PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
    if (ptr_data == 0) {
        OFFLOAD_TRACE(3, "Association does not exist\n");
        return 0;
    }

    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                  ptr_data->is_static);

    // resolve the device-side address lazily the first time it is needed
    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                                  &ptr_data->mic_addr);
        if (res != COI_SUCCESS) {
            return 0;
        }
    }
    *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
    *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
    *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
    *mic_buf_start_offset = ptr_data->alloc_disp;
    *is_static = ptr_data->is_static;
    return ptr_data->is_static ? 1 : ptr_data->get_reference();
}
5783 _Offload_stream _Offload_stream_create(
5784 int device, // MIC device number
5785 int number_of_cpus // Cores allocated to the stream
5788 __offload_init_library();
5790 // check target value
5791 if (device < 0) {
5792 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
5793 LIBOFFLOAD_ABORT;
5795 device %= mic_engines_total;
5797 // Create new stream and get its handle
5798 _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
5799 if (handle == 0) {
5800 OFFLOAD_TRACE(3, "Can't create stream\n");
5801 return 0;
5804 // create pipeline associated with the new stream
5805 mic_engines[device].get_pipeline(handle);
5807 return(handle);
5810 int _Offload_stream_destroy(
5811 int device, // MIC device number
5812 _Offload_stream handle // stream to destroy
5815 __offload_init_library();
5817 // check target value
5818 if (device < 0) {
5819 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
5820 LIBOFFLOAD_ABORT;
5822 device %= mic_engines_total;
5824 mic_engines[device].stream_destroy(handle);
5826 return(true);
5829 int _Offload_stream_completed(int device, _Offload_stream handler)
5831 __offload_init_library();
5833 // check index value
5834 if (device < 0) {
5835 LIBOFFLOAD_ERROR(c_offload_signaled1, device);
5836 LIBOFFLOAD_ABORT;
5839 device %= mic_engines_total;
5841 // get stream
5842 Stream * stream;
5844 if (handler != 0) {
5845 stream = Stream::find_stream(handler, false);
5847 // the stream was not created or was destroyed
5848 if (!stream) {
5849 LIBOFFLOAD_ERROR(c_offload_no_stream, device);
5850 LIBOFFLOAD_ABORT;
5853 // find associated async task
5854 OffloadDescriptor *task = stream->get_last_offload();
5856 // offload was completed by offload_wait pragma or wait clause
5857 if (task == 0) {
5858 return(true);
5860 return task->is_signaled();
5862 // zero handler is for all streams at the device
5863 else {
5864 StreamMap stream_map = Stream::all_streams;
5865 for (StreamMap::iterator it = stream_map.begin();
5866 it != stream_map.end(); it++) {
5867 Stream * stream = it->second;
5868 // find associated async task
5869 OffloadDescriptor *task = stream->get_last_offload();
5871 // offload was completed by offload_wait pragma or wait clause
5872 if (task == 0) {
5873 return(true);
5875 // if even one stream is not completed result is false
5876 if (!task->is_signaled()) {
5877 return false;
5880 // no uncompleted streams
5881 return true;
5885 // IDB support
5886 int __dbg_is_attached = 0;
5887 int __dbg_target_id = -1;
5888 pid_t __dbg_target_so_pid = -1;
5889 char __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
5890 const int __dbg_api_major_version = 1;
5891 const int __dbg_api_minor_version = 0;
5893 void __dbg_target_so_loaded()
5896 void __dbg_target_so_unloaded()