Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Forward declarations: both functions are named as friends in
// offload_engine.h, and clang rejects a `static` declaration that
// appears only after the friend declaration, so declare them here first.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
37 #include "offload_host.h"
39 #include "offload_myo_host.h"
46 #endif // TARGET_WINNT
52 #include <sys/types.h>
58 #if defined(HOST_WINNT)
59 #define PATH_SEPARATOR ";"
61 #define PATH_SEPARATOR ":"
// Offload sequence number for trace/report output; 0 when no timer data
// is available. The argument and the whole expansion are parenthesized so
// the macro composes safely inside larger expressions (CERT PRE01-C/PRE02-C).
#define GET_OFFLOAD_NUMBER(timer_data) \
    ((timer_data) ? (timer_data)->offload_number : 0)
// Callback invoked on task completion; installed elsewhere at runtime
// (NOTE(review): the setter is not visible in this chunk — confirm caller).
static void (*task_completion_callback)(void *);
71 // Windows does not support imports from libraries without actually
72 // including them as dependence. We don't want to include in the
73 // dependence since is it used only for Fortran when traceback is enabled.
74 // Chose to implement it with GetProcAddress.
75 #define FORTRAN_TRACE_BACK win_for__continue_traceback
76 int win_for__continue_traceback( _Offload_result coi_offload_result
)
79 int (* TraceBackRoutine
)(_Offload_result value
);
81 hDLL
= LoadLibrary("libifcoremd.dll");
83 TraceBackRoutine
= (int (*)(_Offload_result
)) GetProcAddress(hDLL
,
84 "for__continue_traceback");
85 if (TraceBackRoutine
!= 0) {
86 return TraceBackRoutine(coi_offload_result
);
90 "Cannot find for__continue_traceback routine in libifcorert.dll\n");
95 OFFLOAD_TRACE(3, "Cannot load libifcorert.dll\n");
101 #else // TARGET_WINNT
103 #define FORTRAN_TRACE_BACK for__continue_traceback
105 // for__continue_traceback is provided as a dummy to resolve link time symbols
106 // for C/C++ programs. For Fortran the actual fortran library function in
107 // libifcore.so is used.
108 #pragma weak for__continue_traceback
109 int for__continue_traceback( _Offload_result coi_offload_result
)
112 "liboffload function for_continue_traceback should not be called.\n");
115 #endif //TARGET_WINNT
// Minimal subset of ELF-64 declarations, needed on Windows where no system
// ELF header exists. The ELF header is only inspected to determine what
// kind of binary the target image contains — shared library or executable.
typedef uint16_t Elf64_Half;    // unsigned 16-bit half word
typedef uint32_t Elf64_Word;    // unsigned 32-bit word
typedef uint64_t Elf64_Addr;    // unsigned 64-bit address
typedef uint64_t Elf64_Off;     // unsigned 64-bit file offset
135 unsigned char e_ident
[EI_NIDENT
];
137 Elf64_Half e_machine
;
138 Elf64_Word e_version
;
144 Elf64_Half e_phentsize
;
146 Elf64_Half e_shentsize
;
148 Elf64_Half e_shstrndx
;
150 #endif // TARGET_WINNT
// Host console and file logging state.
int console_enabled = 0;    // console trace level; 0 disables tracing
int offload_number = 0;     // running count of offloads, used in reports

// Names of the environment variables that control tracing, reporting and
// timing on the host.
static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
static const char *timer_envname = "H_TIME";

// Location of the offload_main executable. Used if the main application
// has no offload and is not built with -offload, but a dynamic library
// linked in contains offload pragmas.
char *mic_device_main = 0;

// DMA channel count used by COI; set via the
// OFFLOAD_DMA_CHANNEL_COUNT environment variable.
uint32_t mic_dma_channel_count;
171 static const char* vardesc_direction_as_string
[] = {
177 static const char* vardesc_type_as_string
[] = {
198 Engine
* mic_engines
= 0;
199 uint32_t mic_engines_total
= 0;
200 pthread_key_t mic_thread_key
;
201 MicEnvVar mic_env_vars
;
202 uint64_t cpu_frequency
= 0;
205 uint32_t mic_stack_size
= 12 * 1024 * 1024;
208 uint64_t mic_buffer_size
= 0;
210 // Preallocated 4K page memory size for buffers on MIC
211 uint64_t mic_4k_buffer_size
= 0;
213 // Preallocated 2M page memory size for buffers on MIC
214 uint64_t mic_2m_buffer_size
= 0;
217 // MIC_LD_LIBRARY_PATH
218 char* mic_library_path
= 0;
221 bool mic_proxy_io
= true;
224 char* mic_proxy_fs_root
= 0;
226 // Threshold for creating buffers with large pages. Buffer is created
227 // with large pages hint if its size exceeds the threshold value.
228 // By default large pages are disabled right now (by setting default
229 // value for threshold to MAX) due to HSD 4114629.
230 uint64_t __offload_use_2mb_buffers
= 0xffffffffffffffffULL
;
231 static const char *mic_use_2mb_buffers_envname
=
232 "MIC_USE_2MB_BUFFERS";
234 static uint64_t __offload_use_async_buffer_write
= 2 * 1024 * 1024;
235 static const char *mic_use_async_buffer_write_envname
=
236 "MIC_USE_ASYNC_BUFFER_WRITE";
238 static uint64_t __offload_use_async_buffer_read
= 2 * 1024 * 1024;
239 static const char *mic_use_async_buffer_read_envname
=
240 "MIC_USE_ASYNC_BUFFER_READ";
242 // device initialization type
243 OffloadInitType __offload_init_type
= c_init_on_offload_all
;
244 static const char *offload_init_envname
= "OFFLOAD_INIT";
247 static bool __offload_active_wait
= true;
248 static const char *offload_active_wait_envname
= "OFFLOAD_ACTIVE_WAIT";
250 // OMP_DEFAULT_DEVICE
251 int __omp_device_num
= 0;
252 static const char *omp_device_num_envname
= "OMP_DEFAULT_DEVICE";
254 //OFFLOAD_PARALLEL_COPY
255 static bool __offload_parallel_copy
= false;
256 static const char *parallel_copy_envname
= "OFFLOAD_PARALLEL_COPY";
258 //Use COI interface for noncontiguous transfer if it exists.
259 static bool __offload_use_coi_noncontiguous_transfer
= false;
260 static const char *use_coi_noncontiguous_transfer_envname
=
261 "MIC_USE_COI_MULTI_D";
263 // The list of pending target libraries
264 static bool __target_libs
;
265 static TargetImageList __target_libs_list
;
266 static mutex_t __target_libs_lock
;
267 static mutex_t stack_alloc_lock
;
270 TargetImage
* __target_exe
;
272 // Print readable offload flags
273 static void trace_offload_flags(
274 OffloadHostTimerData
* timer_data
,
275 OffloadFlags offload_flags
278 // Sized big enough for all flag names
281 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
282 sprintf(fbuffer
, " OffloadFlags=(");
283 if (offload_flags
.bits
.fortran_traceback
) {
284 sprintf(fbuffer
+strlen(fbuffer
), "fortran_traceback");
287 if (offload_flags
.bits
.omp_async
) {
288 sprintf(fbuffer
+strlen(fbuffer
), first
? "omp_async" : ",omp_async");
291 OFFLOAD_DEBUG_TRACE_1(1,
292 GET_OFFLOAD_NUMBER(timer_data
), c_offload_init_func
,
297 // Print readable varDesc flags
298 static void trace_varDesc_flags(
299 OffloadHostTimerData
* timer_data
,
300 varDescFlags offload_flags
303 // SIzed big enough for all flag names
306 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
307 sprintf(fbuffer
, " varDescFlags=(");
308 if (offload_flags
.is_static
) {
309 sprintf(fbuffer
+strlen(fbuffer
), "is_static");
312 if (offload_flags
.is_static_dstn
) {
313 sprintf(fbuffer
+strlen(fbuffer
),
314 first
? "is_static_dstn" : ",is_static_dstn");
317 if (offload_flags
.has_length
) {
318 sprintf(fbuffer
+strlen(fbuffer
),
319 first
? "has_length" : ",has_length");
322 if (offload_flags
.is_stack_buf
) {
323 sprintf(fbuffer
+strlen(fbuffer
),
324 first
? "is_stack_buf" : ",is_stack_buf");
327 if (offload_flags
.targetptr
) {
328 sprintf(fbuffer
+strlen(fbuffer
),
329 first
? "targetptr" : ",targetptr");
332 if (offload_flags
.preallocated
) {
333 sprintf(fbuffer
+strlen(fbuffer
),
334 first
? "preallocated" : ",preallocated");
337 if (offload_flags
.is_pointer
) {
338 sprintf(fbuffer
+strlen(fbuffer
),
339 first
? "is_pointer" : ",is_pointer");
342 if (offload_flags
.sink_addr
) {
343 sprintf(fbuffer
+strlen(fbuffer
),
344 first
? "sink_addr" : ",sink_addr");
347 if (offload_flags
.alloc_disp
) {
348 sprintf(fbuffer
+strlen(fbuffer
),
349 first
? "alloc_disp" : ",alloc_disp");
352 if (offload_flags
.is_noncont_src
) {
353 sprintf(fbuffer
+strlen(fbuffer
),
354 first
? "is_noncont_src" : ",is_noncont_src");
357 if (offload_flags
.is_noncont_dst
) {
358 sprintf(fbuffer
+strlen(fbuffer
),
359 first
? "is_noncont_dst" : ",is_noncont_dst");
362 if (offload_flags
.always_copy
) {
363 sprintf(fbuffer
+strlen(fbuffer
),
364 first
? "always_copy" : ",always_copy");
367 if (offload_flags
.always_delete
) {
368 sprintf(fbuffer
+strlen(fbuffer
),
369 first
? "always_delete" : ",always_delete");
372 OFFLOAD_DEBUG_TRACE_1(1,
373 GET_OFFLOAD_NUMBER(timer_data
), c_offload_init_func
,
378 static char * offload_get_src_base(void * ptr
, uint8_t type
)
381 if (VAR_TYPE_IS_PTR(type
)) {
382 base
= *static_cast<char**>(ptr
);
384 else if (VAR_TYPE_IS_SCALAR(type
)) {
385 base
= static_cast<char*>(ptr
);
387 else if (VAR_TYPE_IS_DV_DATA_SLICE(type
) || VAR_TYPE_IS_DV_DATA(type
)) {
389 if (VAR_TYPE_IS_DV_DATA_SLICE(type
)) {
390 const Arr_Desc
*ap
= static_cast<const Arr_Desc
*>(ptr
);
391 dvp
= (type
== c_dv_data_slice
) ?
392 reinterpret_cast<ArrDesc
*>(ap
->base
) :
393 *reinterpret_cast<ArrDesc
**>(ap
->base
);
396 dvp
= (type
== c_dv_data
) ?
397 static_cast<ArrDesc
*>(ptr
) :
398 *static_cast<ArrDesc
**>(ptr
);
400 base
= reinterpret_cast<char*>(dvp
->Base
);
408 void OffloadDescriptor::report_coi_error(error_types msg
, COIRESULT res
)
410 // special case for the 'process died' error
411 if (res
== COI_PROCESS_DIED
) {
412 m_device
.fini_process(true);
417 if (res
== COI_OUT_OF_MEMORY
) {
418 msg
= c_buf_create_out_of_mem
;
422 case c_buf_create_from_mem
:
423 case c_buf_get_address
:
424 case c_pipeline_create
:
425 case c_pipeline_run_func
:
426 LIBOFFLOAD_ERROR(msg
, m_device
.get_logical_index(), res
);
435 case c_buf_set_state
:
436 LIBOFFLOAD_ERROR(msg
, res
);
447 _Offload_result
OffloadDescriptor::translate_coi_error(COIRESULT res
) const
451 return OFFLOAD_SUCCESS
;
453 case COI_PROCESS_DIED
:
454 return OFFLOAD_PROCESS_DIED
;
456 case COI_OUT_OF_MEMORY
:
457 return OFFLOAD_OUT_OF_MEMORY
;
460 return OFFLOAD_ERROR
;
464 // is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
465 // is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
466 // allocate memory at target; use its value as base in target table.
467 // is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
468 // base - is address at target of preallocated memory; use its value as
469 // base in target table.
471 bool OffloadDescriptor::alloc_ptr_data(
483 // total length of base
484 int64_t length
= size
;
486 COIBUFFER targptr_buf
;
488 uint32_t buffer_flags
= 0;
489 char * base_disp
= reinterpret_cast<char *>(base
) + disp
;
491 // create buffer with large pages if data length exceeds
492 // large page threshold
493 if (length
>= __offload_use_2mb_buffers
) {
494 buffer_flags
= COI_OPTIMIZE_HUGE_PAGE_SIZE
;
496 // Allocate memory at target for targetptr without preallocated as we need
497 // its address as base argument in call to m_device.insert_ptr_data
498 if (is_targptr
&& !is_prealloc
) {
499 length
= alloc_disp
? length
: size
+ disp
;
500 res
= COI::BufferCreate(
506 &m_device
.get_process(),
508 if (res
!= COI_SUCCESS
) {
510 m_status
->result
= translate_coi_error(res
);
512 else if (m_is_mandatory
) {
513 report_coi_error(c_buf_create
, res
);
518 res
= COI::BufferGetSinkAddress(
519 targptr_buf
, reinterpret_cast<uint64_t *>(&base
));
520 if (res
!= COI_SUCCESS
) {
522 m_status
->result
= translate_coi_error(res
);
524 else if (m_is_mandatory
) {
525 report_coi_error(c_buf_get_address
, res
);
531 OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
532 alloc_disp
? base
: base_disp
,
533 alloc_disp
? length
: size
+ disp
);
537 ptr_data
= is_targptr
?
538 m_device
.find_targetptr_data(base_disp
) :
539 m_device
.find_ptr_data(base_disp
);
540 // if ptr_data is found just need to check it for overlapping
546 // If association is not found we must create it.
547 length
= alloc_disp
? length
: size
+ disp
;
548 ptr_data
= is_targptr
?
549 m_device
.insert_targetptr_data(base
, length
, is_new
) :
550 m_device
.insert_ptr_data(base
, length
, is_new
);
554 OFFLOAD_TRACE(3, "Added new association\n");
557 OffloadTimer
timer(get_timer_data(), c_offload_host_alloc_buffers
);
559 // align should be a power of 2
560 if (!pin
&& !is_targptr
&&
561 align
> 0 && (align
& (align
- 1)) == 0) {
562 // offset within mic_buffer. Can do offset optimization
563 // only when source address alignment satisfies requested
564 // alignment on the target (cq172736).
565 if ((reinterpret_cast<intptr_t>(base
) & (align
- 1)) == 0) {
566 ptr_data
->mic_offset
=
567 reinterpret_cast<intptr_t>(base
) & 4095;
571 // buffer size and flags
572 uint64_t buffer_size
= length
+ ptr_data
->mic_offset
;
574 // For targetptr there is no CPU buffer
575 if (pin
|| !is_targptr
) {
577 OFFLOAD_DEBUG_TRACE_1(3,
578 GET_OFFLOAD_NUMBER(get_timer_data()),
579 c_offload_create_buf_host
,
580 "Creating buffer from source memory %p, "
581 "length %lld\n", base
, length
);
583 // result is not checked because we can continue without cpu
584 // buffer. In this case we will use COIBufferRead/Write
585 // instead of COIBufferCopy.
587 COI::BufferCreateFromMemory(length
,
592 &m_device
.get_process(),
598 OFFLOAD_DEBUG_TRACE_1(3,
599 GET_OFFLOAD_NUMBER(get_timer_data()),
600 c_offload_create_buf_mic
,
601 "Creating buffer from sink memory: size %lld, offset %d, "
602 "flags =0x%x\n", buffer_size
,
603 ptr_data
->mic_offset
, buffer_flags
);
604 res
= COI::BufferCreateFromMemory(ptr_data
->cpu_addr
.length(),
609 &m_device
.get_process(),
611 if (res
!= COI_SUCCESS
) {
613 m_status
->result
= translate_coi_error(res
);
615 else if (m_is_mandatory
) {
616 report_coi_error(c_buf_create
, res
);
618 ptr_data
->alloc_ptr_data_lock
.unlock();
622 else if (is_targptr
) {
623 ptr_data
->mic_buf
= targptr_buf
;
626 OFFLOAD_DEBUG_TRACE_1(3,
627 GET_OFFLOAD_NUMBER(get_timer_data()),
628 c_offload_create_buf_mic
,
629 "Creating buffer for sink: size %lld, offset %d, "
630 "flags =0x%x\n", buffer_size
,
631 ptr_data
->mic_offset
, buffer_flags
);
632 res
= COI::BufferCreate(buffer_size
,
637 &m_device
.get_process(),
639 if (res
!= COI_SUCCESS
) {
641 m_status
->result
= translate_coi_error(res
);
643 else if (m_is_mandatory
) {
644 report_coi_error(c_buf_create
, res
);
646 ptr_data
->alloc_ptr_data_lock
.unlock();
652 // make buffer valid on the device.
653 res
= COI::BufferSetState(ptr_data
->mic_buf
,
654 m_device
.get_process(),
658 if (res
!= COI_SUCCESS
) {
660 m_status
->result
= translate_coi_error(res
);
662 else if (m_is_mandatory
) {
663 report_coi_error(c_buf_set_state
, res
);
665 ptr_data
->alloc_ptr_data_lock
.unlock();
669 res
= COI::BufferSetState(ptr_data
->mic_buf
,
674 if (res
!= COI_SUCCESS
) {
676 m_status
->result
= translate_coi_error(res
);
678 else if (m_is_mandatory
) {
679 report_coi_error(c_buf_set_state
, res
);
681 ptr_data
->alloc_ptr_data_lock
.unlock();
686 ptr_data
->alloc_disp
= alloc_disp
;
687 ptr_data
->alloc_ptr_data_lock
.unlock();
690 mutex_locker_t
locker(ptr_data
->alloc_ptr_data_lock
);
692 OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
694 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
695 ptr_data
->is_static
);
697 // This is not a new entry. Make sure that provided address range fits
698 // into existing one.
699 MemRange
addr_range(base
, length
);
700 if (!ptr_data
->cpu_addr
.contains(addr_range
)) {
701 LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc
, base
, length
,
702 const_cast<void *>(ptr_data
->cpu_addr
.start()),
703 ptr_data
->cpu_addr
.length());
707 // if the entry is associated with static data it may not have buffers
708 // created because they are created on demand.
709 if (ptr_data
->is_static
&& !init_static_ptr_data(ptr_data
)) {
717 bool OffloadDescriptor::find_ptr_data(
726 // total length of base
727 int64_t length
= size
;
728 char *base
= reinterpret_cast<char *>(in_base
) + disp
;
730 OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
731 "length %lld\n", base
, length
);
733 // find existing association in pointer table
734 ptr_data
= is_targetptr
?
735 m_device
.find_targetptr_data(base
) :
736 m_device
.find_ptr_data(base
);
739 LIBOFFLOAD_ERROR(c_no_ptr_data
, base
);
742 OFFLOAD_TRACE(3, "Association does not exist\n");
746 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
747 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
748 ptr_data
->is_static
);
750 // make sure that provided address range fits into existing one
751 MemRange
addr_range(base
, length
);
752 if (!ptr_data
->cpu_addr
.contains(addr_range
)) {
754 LIBOFFLOAD_ERROR(c_bad_ptr_mem_range
, base
, length
,
755 const_cast<void *>(ptr_data
->cpu_addr
.start()),
756 ptr_data
->cpu_addr
.length());
759 OFFLOAD_TRACE(3, "Existing association partially overlaps with "
760 "data address range\n");
765 // if the entry is associated with static data it may not have buffers
766 // created because they are created on demand.
767 if (ptr_data
->is_static
&& !init_static_ptr_data(ptr_data
)) {
774 bool OffloadDescriptor::init_static_ptr_data(PtrData
*ptr_data
)
776 OffloadTimer
timer(get_timer_data(), c_offload_host_alloc_buffers
);
778 if (ptr_data
->cpu_buf
== 0) {
779 OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
780 ptr_data
->cpu_addr
.start());
782 COIRESULT res
= COI::BufferCreateFromMemory(
783 ptr_data
->cpu_addr
.length(),
786 const_cast<void*>(ptr_data
->cpu_addr
.start()),
787 1, &m_device
.get_process(),
790 if (res
!= COI_SUCCESS
) {
792 m_status
->result
= translate_coi_error(res
);
795 report_coi_error(c_buf_create_from_mem
, res
);
799 if (ptr_data
->mic_buf
== 0) {
800 OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
803 COIRESULT res
= COI::BufferCreateFromMemory(
804 ptr_data
->cpu_addr
.length(),
807 reinterpret_cast<void*>(ptr_data
->mic_addr
),
808 1, &m_device
.get_process(),
811 if (res
!= COI_SUCCESS
) {
813 m_status
->result
= translate_coi_error(res
);
816 report_coi_error(c_buf_create_from_mem
, res
);
823 bool OffloadDescriptor::init_mic_address(PtrData
*ptr_data
)
825 if (ptr_data
->mic_buf
!= 0 && ptr_data
->mic_addr
== 0) {
826 COIRESULT res
= COI::BufferGetSinkAddress(ptr_data
->mic_buf
,
827 &ptr_data
->mic_addr
);
828 if (res
!= COI_SUCCESS
) {
830 m_status
->result
= translate_coi_error(res
);
832 else if (m_is_mandatory
) {
833 report_coi_error(c_buf_get_address
, res
);
841 bool OffloadDescriptor::nullify_target_stack(
846 char * ptr
= (char*)malloc(size
);
848 LIBOFFLOAD_ERROR(c_malloc
);
851 memset(ptr
, 0, size
);
852 res
= COI::BufferWrite(
857 COI_COPY_UNSPECIFIED
,
860 if (res
!= COI_SUCCESS
) {
862 m_status
->result
= translate_coi_error(res
);
865 report_coi_error(c_buf_write
, res
);
870 bool OffloadDescriptor::offload_stack_memory_manager(
871 const void * stack_begin
,
877 mutex_locker_t
locker(stack_alloc_lock
);
879 PersistData
* new_el
;
880 PersistDataList::iterator it_begin
= m_device
.m_persist_list
.begin();
881 PersistDataList::iterator it_end
;
883 uint64_t cur_thread_id
= m_device
.get_thread_id();
887 for (PersistDataList::iterator it
= m_device
.m_persist_list
.begin();
888 it
!= m_device
.m_persist_list
.end(); it
++) {
889 PersistData cur_el
= *it
;
891 if (stack_begin
> it
->stack_cpu_addr
) {
892 // this stack data must be destroyed
893 if (cur_thread_id
== cur_el
.thread_id
) {
894 m_destroy_stack
.push_front(cur_el
.stack_ptr_data
);
899 else if (stack_begin
== it
->stack_cpu_addr
) {
900 if (routine_id
!= it
-> routine_id
) {
901 // this stack data must be destroyed
902 m_destroy_stack
.push_front(cur_el
.stack_ptr_data
);
908 // stack data is reused
909 m_stack_ptr_data
= it
->stack_ptr_data
;
911 // all obsolete stack sections must be erased from the list
912 m_device
.m_persist_list
.erase(it_begin
, ++it_end
);
915 erase
* sizeof(new_el
->stack_ptr_data
->mic_addr
);
917 OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
918 m_stack_ptr_data
->mic_addr
);
922 else if (stack_begin
< it
->stack_cpu_addr
&&
923 cur_thread_id
== cur_el
.thread_id
) {
929 // all obsolete stack sections must be erased from the list
930 m_device
.m_persist_list
.erase(it_begin
, ++it_end
);
931 m_in_datalen
+= erase
* sizeof(new_el
->stack_ptr_data
->mic_addr
);
933 // new stack table is created
934 new_el
= new PersistData(stack_begin
, routine_id
, buf_size
, cur_thread_id
);
937 uint32_t buffer_flags
= 0;
939 // create buffer with large pages if data length exceeds
940 // large page threshold
941 if (buf_size
>= __offload_use_2mb_buffers
) {
942 buffer_flags
= COI_OPTIMIZE_HUGE_PAGE_SIZE
;
944 res
= COI::BufferCreate(buf_size
,
949 &m_device
.get_process(),
950 &new_el
->stack_ptr_data
->mic_buf
);
951 if (res
!= COI_SUCCESS
) {
953 m_status
->result
= translate_coi_error(res
);
955 else if (m_is_mandatory
) {
956 report_coi_error(c_buf_create
, res
);
960 // make buffer valid on the device.
961 res
= COI::BufferSetState(new_el
->stack_ptr_data
->mic_buf
,
962 m_device
.get_process(),
966 if (res
!= COI_SUCCESS
) {
968 m_status
->result
= translate_coi_error(res
);
970 else if (m_is_mandatory
) {
971 report_coi_error(c_buf_set_state
, res
);
975 res
= COI::BufferSetState(new_el
->stack_ptr_data
->mic_buf
,
980 if (res
!= COI_SUCCESS
) {
982 m_status
->result
= translate_coi_error(res
);
984 else if (m_is_mandatory
) {
985 report_coi_error(c_buf_set_state
, res
);
989 // persistence algorithm requires target stack initialy to be nullified
990 if (!nullify_target_stack(new_el
->stack_ptr_data
->mic_buf
, buf_size
)) {
994 m_stack_ptr_data
= new_el
->stack_ptr_data
;
995 init_mic_address(m_stack_ptr_data
);
996 OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
997 m_stack_ptr_data
->mic_addr
);
998 m_device
.m_persist_list
.push_front(*new_el
);
999 init_mic_address(new_el
->stack_ptr_data
);
1004 bool OffloadDescriptor::setup_descriptors(
1009 const void *stack_addr
1014 OffloadTimer
timer(get_timer_data(), c_offload_host_setup_buffers
);
1016 // make a copy of variable descriptors
1017 m_vars_total
= vars_total
;
1018 if (vars_total
> 0) {
1019 m_vars
= (VarDesc
*) malloc(m_vars_total
* sizeof(VarDesc
));
1021 LIBOFFLOAD_ERROR(c_malloc
);
1022 memcpy(m_vars
, vars
, m_vars_total
* sizeof(VarDesc
));
1023 m_vars_extra
= (VarExtra
*) malloc(m_vars_total
* sizeof(VarExtra
));
1024 if (m_vars_extra
== NULL
)
1025 LIBOFFLOAD_ERROR(c_malloc
);
1029 m_in_deps_allocated
= m_vars_total
+ 1;
1030 m_in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * m_in_deps_allocated
);
1031 if (m_in_deps
== NULL
)
1032 LIBOFFLOAD_ERROR(c_malloc
);
1033 if (m_vars_total
> 0) {
1034 m_out_deps_allocated
= m_vars_total
;
1035 m_out_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * m_out_deps_allocated
);
1036 if (m_out_deps
== NULL
)
1037 LIBOFFLOAD_ERROR(c_malloc
);
1040 // copyin/copyout data length
1044 // First pass over variable descriptors
1045 // - Calculate size of the input and output non-pointer data
1046 // - Allocate buffers for input and output pointers
1047 for (int i
= 0; i
< m_vars_total
; i
++) {
1048 void* alloc_base
= NULL
;
1049 int64_t alloc_disp
= 0;
1050 int64_t alloc_size
= 0;
1051 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
1052 m_vars
[i
].into
== NULL
);
1054 const char *var_sname
= "";
1055 if (vars2
!= NULL
&& i
< vars_total
) {
1056 if (vars2
[i
].sname
!= NULL
) {
1057 var_sname
= vars2
[i
].sname
;
1060 OFFLOAD_TRACE(2, " VarDesc %d, var=%s, %s, %s\n",
1062 vardesc_direction_as_string
[m_vars
[i
].direction
.bits
],
1063 vardesc_type_as_string
[m_vars
[i
].type
.src
]);
1064 if (vars2
!= NULL
&& i
< vars_total
&& vars2
[i
].dname
!= NULL
) {
1065 OFFLOAD_TRACE(2, " into=%s, %s\n", vars2
[i
].dname
,
1066 vardesc_type_as_string
[m_vars
[i
].type
.dst
]);
1069 " type_src=%d, type_dstn=%d, direction=%d, "
1070 "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
1071 "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
1074 m_vars
[i
].direction
.bits
,
1078 m_vars
[i
].mic_offset
,
1079 m_vars
[i
].flags
.bits
,
1085 // If any varDesc flags bits set, show them
1086 if (console_enabled
>= 1 && m_vars
[i
].flags
.bits
!= 0) {
1087 trace_varDesc_flags(get_timer_data(), m_vars
[i
].flags
);
1090 // preallocated implies targetptr
1091 if (m_vars
[i
].flags
.preallocated
) {
1092 // targetptr preallocated alloc_if(1) may not be used with
1094 if (m_vars
[i
].direction
.in
&& m_vars
[i
].alloc_if
) {
1095 LIBOFFLOAD_ERROR(c_in_with_preallocated
);
1098 m_vars
[i
].flags
.targetptr
= 1;
1100 if (m_vars
[i
].alloc
!= NULL
) {
1102 const Arr_Desc
*ap
=
1103 static_cast<const Arr_Desc
*>(m_vars
[i
].alloc
);
1106 ARRAY_DESC_DUMP(" ", "ALLOC", ap
, 0, 1);
1108 __arr_data_offset_and_length(ap
, alloc_disp
, alloc_size
);
1110 alloc_base
= reinterpret_cast<void*>(ap
->base
);
1113 m_vars_extra
[i
].alloc
= m_vars
[i
].alloc
;
1114 m_vars_extra
[i
].cpu_disp
= 0;
1115 m_vars_extra
[i
].cpu_offset
= 0;
1116 m_vars_extra
[i
].src_data
= 0;
1117 m_vars_extra
[i
].read_rng_src
= 0;
1118 m_vars_extra
[i
].read_rng_dst
= 0;
1119 m_vars_extra
[i
].omp_last_event_type
= c_last_not
;
1120 // flag is_arr_ptr_el is 1 only for var_descs generated
1121 // for c_data_ptr_array type
1122 if (i
< vars_total
) {
1123 m_vars_extra
[i
].is_arr_ptr_el
= 0;
1126 switch (m_vars
[i
].type
.src
) {
1127 case c_data_ptr_array
:
1130 const VarDesc3
*vd3
=
1131 static_cast<const VarDesc3
*>(m_vars
[i
].ptr
);
1132 int flags
= vd3
->array_fields
;
1134 " pointer array flags = %04x\n", flags
);
1136 " pointer array type is %s\n",
1137 vardesc_type_as_string
[flags
& 0x3f]);
1138 ap
= static_cast<const Arr_Desc
*>(vd3
->ptr_array
);
1139 ARRAY_DESC_DUMP(" ", "ptr array", ap
,
1140 m_vars
[i
].flags
.is_pointer
, 1);
1141 if (m_vars
[i
].into
) {
1142 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
1144 " ", "into array", ap
, 0, 1);
1146 if ((flags
& (1<<flag_align_is_array
)) != 0) {
1147 ap
= static_cast<const Arr_Desc
*>(vd3
->align_array
);
1149 " ", "align array", ap
, 0, 1);
1151 if ((flags
& (1<<flag_alloc_if_is_array
)) != 0) {
1152 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_if_array
);
1154 " ", "alloc_if array", ap
, 0, 1);
1156 if ((flags
& (1<<flag_free_if_is_array
)) != 0) {
1157 ap
= static_cast<const Arr_Desc
*>(vd3
->free_if_array
);
1159 " ", "free_if array", ap
, 0, 1);
1161 if ((flags
& (1<<flag_extent_start_is_array
)) != 0) {
1162 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_start
);
1164 " ", "extent_start array", ap
, 0, 1);
1166 (1<<flag_extent_start_is_scalar
)) != 0) {
1168 " extent_start scalar = %d\n",
1169 (int64_t)vd3
->extent_start
);
1171 if ((flags
& (1<<flag_extent_elements_is_array
)) != 0) {
1172 ap
= static_cast<const Arr_Desc
*>
1173 (vd3
->extent_elements
);
1174 ARRAY_DESC_DUMP(" ",
1175 "extent_elements array", ap
, 0, 1);
1177 (1<<flag_extent_elements_is_scalar
)) != 0) {
1179 " extent_elements scalar = %d\n",
1180 (int64_t)vd3
->extent_elements
);
1182 if ((flags
& (1<<flag_into_start_is_array
)) != 0) {
1183 ap
= static_cast<const Arr_Desc
*>(vd3
->into_start
);
1185 " ", "into_start array", ap
, 0, 1);
1187 (1<<flag_into_start_is_scalar
)) != 0) {
1189 " into_start scalar = %d\n",
1190 (int64_t)vd3
->into_start
);
1192 if ((flags
& (1<<flag_into_elements_is_array
)) != 0) {
1193 ap
= static_cast<const Arr_Desc
*>(vd3
->into_elements
);
1195 " ", "into_elements array", ap
, 0, 1);
1197 (1<<flag_into_elements_is_scalar
)) != 0) {
1199 " into_elements scalar = %d\n",
1200 (int64_t)vd3
->into_elements
);
1202 if ((flags
& (1<<flag_alloc_start_is_array
)) != 0) {
1203 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_start
);
1205 " ", "alloc_start array", ap
, 0, 1);
1207 (1<<flag_alloc_start_is_scalar
)) != 0) {
1209 " alloc_start scalar = %d\n",
1210 (int64_t)vd3
->alloc_start
);
1212 if ((flags
& (1<<flag_alloc_elements_is_array
)) != 0) {
1213 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_elements
);
1214 ARRAY_DESC_DUMP(" ",
1215 "alloc_elements array", ap
, 0, 1);
1217 (1<<flag_alloc_elements_is_scalar
)) != 0) {
1219 " alloc_elements scalar = %d\n",
1220 (int64_t)vd3
->alloc_elements
);
1223 if (!gen_var_descs_for_pointer_array(i
)) {
1231 // In all uses later
1232 // VarDesc.size will have the length of the data to be
1234 // VarDesc.disp will have an offset from base
1235 if (m_vars
[i
].type
.src
== c_cean_var
) {
1237 const Arr_Desc
*ap
=
1238 static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1241 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 0, !src_is_for_mic
);
1243 // offset and length are derived from the array descriptor
1244 __arr_data_offset_and_length(ap
, m_vars
[i
].disp
,
1246 if (!is_arr_desc_contiguous(ap
)) {
1247 m_vars
[i
].flags
.is_noncont_src
= 1;
1248 m_vars_extra
[i
].read_rng_src
=
1249 init_read_ranges_arr_desc(ap
);
1251 // all necessary information about length and offset is
1252 // transferred in var descriptor. There is no need to send
1253 // array descriptor to the target side.
1254 m_vars
[i
].ptr
= reinterpret_cast<void*>(ap
->base
);
1257 m_vars
[i
].size
*= m_vars
[i
].count
;
1261 if (m_vars
[i
].direction
.bits
) {
1262 // make sure that transfer size > 0
1263 if (m_vars
[i
].size
<= 0) {
1264 LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size
);
1268 if (m_vars
[i
].flags
.is_static
) {
1271 // find data associated with variable
1272 if (!find_ptr_data(ptr_data
,
1280 if (ptr_data
!= 0) {
1281 // offset to base from the beginning of the buffer
1284 (char*) m_vars
[i
].ptr
-
1285 (char*) ptr_data
->cpu_addr
.start();
1288 m_vars
[i
].flags
.is_static
= false;
1289 if (m_vars
[i
].into
== NULL
) {
1290 m_vars
[i
].flags
.is_static_dstn
= false;
1293 m_vars_extra
[i
].src_data
= ptr_data
;
1297 if (m_vars
[i
].flags
.is_static
) {
1298 // Static data is transferred either by omp target
1299 // update construct which passes zeros for
1300 // alloc_if and free_if or by always modifier.
1301 if (!m_vars
[i
].flags
.always_copy
&&
1302 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
)) {
1303 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1307 AutoData
*auto_data
;
1308 if (m_vars
[i
].alloc_if
) {
1309 auto_data
= m_device
.insert_auto_data(
1310 m_vars
[i
].ptr
, m_vars
[i
].size
);
1311 auto_data
->add_reference();
1314 // TODO: what should be done if var is not in
1316 auto_data
= m_device
.find_auto_data(
1320 // For automatic variables data is transferred:
1321 // - if always modifier is used OR
1322 // - if alloc_if == 0 && free_if == 0 OR
1323 // - if reference count is 1
1324 if (!m_vars
[i
].flags
.always_copy
&&
1325 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
) &&
1327 auto_data
->get_reference() != 1) {
1328 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1331 // save data for later use
1332 m_vars_extra
[i
].auto_data
= auto_data
;
1336 if (m_vars
[i
].direction
.in
&&
1337 !m_vars
[i
].flags
.is_static
) {
1338 m_in_datalen
+= m_vars
[i
].size
;
1340 // for non-static target destination defined as CEAN
1341 // expression we pass to target its size and dist
1342 if (m_vars
[i
].into
== NULL
&&
1343 m_vars
[i
].type
.src
== c_cean_var
) {
1344 m_in_datalen
+= 2 * sizeof(uint64_t);
1346 m_need_runfunction
= true;
1348 if (m_vars
[i
].direction
.out
&&
1349 !m_vars
[i
].flags
.is_static
) {
1350 m_out_datalen
+= m_vars
[i
].size
;
1351 m_need_runfunction
= true;
1357 if (m_vars
[i
].direction
.bits
||
1358 m_vars
[i
].alloc_if
||
1359 m_vars
[i
].free_if
) {
1360 ArrDesc
*dvp
= static_cast<ArrDesc
*>(m_vars
[i
].ptr
);
1363 __dv_desc_dump("IN/OUT", dvp
);
1365 // send dope vector contents excluding base
1366 m_in_datalen
+= m_vars
[i
].size
- sizeof(uint64_t);
1367 m_need_runfunction
= true;
1372 if ((m_vars
[i
].direction
.bits
||
1373 m_vars
[i
].alloc_if
||
1374 m_vars
[i
].free_if
) &&
1375 m_vars
[i
].size
== 0) {
1378 strlen(*static_cast<char**>(m_vars
[i
].ptr
)) + 1;
1383 if (m_vars
[i
].flags
.is_stack_buf
&&
1384 !m_vars
[i
].direction
.bits
&&
1385 m_vars
[i
].alloc_if
) {
1386 // this var_desc is for stack buffer
1389 if (!offload_stack_memory_manager(
1390 stack_addr
, entry_id
,
1391 m_vars
[i
].count
, m_vars
[i
].align
, &is_new
)) {
1395 m_compute_buffers
.push_back(
1396 m_stack_ptr_data
->mic_buf
);
1397 m_device
.m_persist_list
.front().cpu_stack_addr
=
1398 static_cast<char*>(m_vars
[i
].ptr
);
1401 m_vars
[i
].flags
.sink_addr
= 1;
1402 m_in_datalen
+= sizeof(m_stack_ptr_data
->mic_addr
);
1404 m_vars
[i
].size
= m_destroy_stack
.size();
1405 m_vars_extra
[i
].src_data
= m_stack_ptr_data
;
1407 // need to add or remove references for stack buffer at target
1408 if (is_new
|| m_destroy_stack
.size()) {
1409 m_need_runfunction
= true;
1416 case c_cean_var_ptr
:
1418 if (m_vars
[i
].type
.src
== c_cean_var_ptr
) {
1420 const Arr_Desc
*ap
=
1421 static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1424 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 1, !src_is_for_mic
);
1426 // offset and length are derived from the array descriptor
1427 __arr_data_offset_and_length(ap
, m_vars
[i
].disp
,
1430 if (!is_arr_desc_contiguous(ap
)) {
1431 m_vars
[i
].flags
.is_noncont_src
= 1;
1432 m_vars_extra
[i
].read_rng_src
=
1433 init_read_ranges_arr_desc(ap
);
1435 // all necessary information about length and offset is
1436 // transferred in var descriptor. There is no need to send
1437 // array descriptor to the target side.
1438 m_vars
[i
].ptr
= reinterpret_cast<void*>(ap
->base
);
1440 else if (m_vars
[i
].type
.src
== c_dv_ptr
) {
1441 // need to send DV to the device unless it is 'nocopy'
1442 if (m_vars
[i
].direction
.bits
||
1443 m_vars
[i
].alloc_if
||
1444 m_vars
[i
].free_if
) {
1445 ArrDesc
*dvp
= *static_cast<ArrDesc
**>(m_vars
[i
].ptr
);
1448 __dv_desc_dump("IN/OUT", dvp
);
1450 m_vars
[i
].direction
.bits
= c_parameter_in
;
1457 // c_data_ptr or c_string_ptr
1458 m_vars
[i
].size
*= m_vars
[i
].count
;
1462 if (m_vars
[i
].direction
.bits
||
1463 m_vars
[i
].alloc_if
||
1464 m_vars
[i
].free_if
) {
1467 // check that buffer length > 0
1468 if (m_vars
[i
].alloc_if
&&
1469 m_vars
[i
].disp
+ m_vars
[i
].size
<
1470 (m_is_openmp
? 0 : 1)) {
1471 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len
);
1476 void *base
= *static_cast<void**>(m_vars
[i
].ptr
);
1478 // allocate buffer if we have no INTO and don't need
1479 // allocation for the ptr at target
1480 if (src_is_for_mic
) {
1481 if (m_vars
[i
].flags
.is_stack_buf
) {
1482 // for stack persistent objects ptr data is created
1483 // by var_desc with number 0.
1484 // Its ptr_data is stored at m_stack_ptr_data
1485 ptr_data
= m_stack_ptr_data
;
1486 m_vars
[i
].flags
.sink_addr
= 1;
1488 else if (m_vars
[i
].alloc_if
) {
1489 if (m_vars
[i
].flags
.preallocated
) {
1490 m_out_datalen
+= sizeof(void*);
1491 m_need_runfunction
= true;
1495 if (!alloc_ptr_data(
1497 reinterpret_cast<char *>(base
) + alloc_disp
,
1498 (alloc_base
!= NULL
) ?
1499 alloc_disp
: m_vars
[i
].disp
,
1500 (alloc_base
!= NULL
) ?
1501 alloc_size
: m_vars
[i
].size
,
1503 (alloc_base
!= NULL
) ?
1504 0 : m_vars
[i
].align
,
1505 m_vars
[i
].flags
.targetptr
,
1507 m_vars
[i
].flags
.pin
)) {
1510 if (m_vars
[i
].flags
.targetptr
) {
1511 if (!init_mic_address(ptr_data
)) {
1514 *static_cast<void**>(m_vars
[i
].ptr
) = base
=
1515 reinterpret_cast<void*>(ptr_data
->mic_addr
);
1517 if (ptr_data
->add_reference() == 0 &&
1518 ptr_data
->mic_buf
!= 0) {
1519 // add buffer to the list of buffers that
1520 // are passed to dispatch call
1521 m_compute_buffers
.push_back(
1524 else if (!m_vars
[i
].flags
.pin
&&
1525 !m_vars
[i
].flags
.preallocated
) {
1526 // will send buffer address to device
1527 m_vars
[i
].flags
.sink_addr
= 1;
1530 if (!m_vars
[i
].flags
.pin
&&
1531 !ptr_data
->is_static
) {
1532 // need to add reference for buffer
1533 m_need_runfunction
= true;
1537 bool error_if_not_found
= true;
1539 // For omp target update variable is ignored
1540 // if it does not exist.
1541 if (m_vars
[i
].flags
.always_copy
||
1542 (!m_vars
[i
].alloc_if
&&
1543 !m_vars
[i
].free_if
)) {
1544 error_if_not_found
= false;
1548 // use existing association from pointer table
1549 if (!find_ptr_data(ptr_data
,
1553 m_vars
[i
].flags
.targetptr
,
1554 error_if_not_found
)) {
1559 // make var nocopy if it does not exist
1560 if (ptr_data
== 0) {
1561 m_vars
[i
].direction
.bits
=
1566 if (ptr_data
!= 0) {
1567 m_vars
[i
].flags
.sink_addr
= 1;
1571 if (ptr_data
!= 0) {
1573 // data is transferred only if
1574 // alloc_if == 0 && free_if == 0
1575 // or reference count is 1
1576 if (!m_vars
[i
].flags
.always_copy
&&
1577 ((m_vars
[i
].alloc_if
||
1578 m_vars
[i
].free_if
) &&
1579 ptr_data
->get_reference() != 1)) {
1580 m_vars
[i
].direction
.bits
=
1585 if (ptr_data
->alloc_disp
!= 0) {
1586 m_vars
[i
].flags
.alloc_disp
= 1;
1587 m_in_datalen
+= sizeof(alloc_disp
);
1590 if (m_vars
[i
].flags
.sink_addr
) {
1591 // get buffers's address on the sink
1592 if (!init_mic_address(ptr_data
)) {
1596 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
1599 if (!m_vars
[i
].flags
.pin
&&
1600 !ptr_data
->is_static
&& m_vars
[i
].free_if
) {
1601 // need to decrement buffer reference on target
1602 m_need_runfunction
= true;
1605 // offset to base from the beginning of the buffer
1607 m_vars
[i
].offset
= (char*) base
-
1608 (char*) ptr_data
->cpu_addr
.start();
1610 // copy other pointer properties to var descriptor
1611 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
1612 m_vars
[i
].flags
.is_static
= ptr_data
->is_static
;
1616 if (!find_ptr_data(ptr_data
,
1626 (char*) ptr_data
->cpu_addr
.start();
1630 // save pointer data
1631 m_vars_extra
[i
].src_data
= ptr_data
;
1636 if (m_vars
[i
].direction
.in
) {
1637 m_in_datalen
+= __offload_funcs
.max_name_length();
1639 if (m_vars
[i
].direction
.out
) {
1640 m_out_datalen
+= __offload_funcs
.max_name_length();
1642 m_need_runfunction
= true;
1647 case c_dv_data_slice
:
1648 case c_dv_ptr_data_slice
:
1650 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1652 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1654 dvp
= (m_vars
[i
].type
.src
== c_dv_data_slice
) ?
1655 reinterpret_cast<ArrDesc
*>(ap
->base
) :
1656 *reinterpret_cast<ArrDesc
**>(ap
->base
);
1659 dvp
= (m_vars
[i
].type
.src
== c_dv_data
) ?
1660 static_cast<ArrDesc
*>(m_vars
[i
].ptr
) :
1661 *static_cast<ArrDesc
**>(m_vars
[i
].ptr
);
1664 // if allocatable dope vector isn't allocated don't
1665 // transfer its data
1666 if (!__dv_is_allocated(dvp
)) {
1667 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
1668 m_vars
[i
].alloc_if
= 0;
1669 m_vars
[i
].free_if
= 0;
1671 if (m_vars
[i
].direction
.bits
||
1672 m_vars
[i
].alloc_if
||
1673 m_vars
[i
].free_if
) {
1676 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1677 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].ptr
);
1680 ARRAY_DESC_DUMP("", "IN/OUT", ap
, 0, !src_is_for_mic
);
1682 if (!__dv_is_contiguous(dvp
)) {
1683 m_vars
[i
].flags
.is_noncont_src
= 1;
1684 m_vars_extra
[i
].read_rng_src
=
1685 init_read_ranges_dv(dvp
);
1688 // size and displacement
1689 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
)) {
1690 // offset and length are derived from the
1692 __arr_data_offset_and_length(ap
,
1695 if (m_vars
[i
].direction
.bits
) {
1696 if (!is_arr_desc_contiguous(ap
)) {
1697 if (m_vars
[i
].flags
.is_noncont_src
) {
1698 LIBOFFLOAD_ERROR(c_slice_of_noncont_array
);
1701 m_vars
[i
].flags
.is_noncont_src
= 1;
1702 m_vars_extra
[i
].read_rng_src
=
1703 init_read_ranges_arr_desc(ap
);
1708 if (m_vars
[i
].flags
.has_length
) {
1710 __dv_data_length(dvp
, m_vars
[i
].count
);
1713 m_vars
[i
].size
= __dv_data_length(dvp
);
1718 // check that length >= 0
1719 if (m_vars
[i
].alloc_if
&&
1720 (m_vars
[i
].disp
+ m_vars
[i
].size
< 0)) {
1721 LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len
);
1726 void *base
= reinterpret_cast<void*>(dvp
->Base
);
1729 // allocate buffer if we have no INTO and don't need
1730 // allocation for the ptr at target
1731 if (src_is_for_mic
) {
1732 if (m_vars
[i
].alloc_if
) {
1734 if (!alloc_ptr_data(
1736 reinterpret_cast<char *>(base
) + alloc_disp
,
1737 (alloc_base
!= NULL
) ?
1738 alloc_disp
: m_vars
[i
].disp
,
1739 (alloc_base
!= NULL
) ?
1740 alloc_size
: m_vars
[i
].size
,
1742 (alloc_base
!= NULL
) ?
1743 0 : m_vars
[i
].align
,
1744 m_vars
[i
].flags
.targetptr
,
1745 m_vars
[i
].flags
.preallocated
,
1746 m_vars
[i
].flags
.pin
)) {
1750 if (ptr_data
->add_reference() == 0 &&
1751 ptr_data
->mic_buf
!= 0) {
1752 // add buffer to the list of buffers
1753 // that are passed to dispatch call
1754 m_compute_buffers
.push_back(
1758 // will send buffer address to device
1759 m_vars
[i
].flags
.sink_addr
= 1;
1762 if (!ptr_data
->is_static
) {
1763 // need to add reference for buffer
1764 m_need_runfunction
= true;
1768 bool error_if_not_found
= true;
1770 // For omp target update variable is ignored
1771 // if it does not exist.
1772 if (m_vars
[i
].flags
.always_copy
||
1773 (!m_vars
[i
].alloc_if
&&
1774 !m_vars
[i
].free_if
)) {
1775 error_if_not_found
= false;
1779 // use existing association from pointer table
1780 if (!find_ptr_data(ptr_data
,
1784 m_vars
[i
].flags
.targetptr
,
1785 error_if_not_found
)) {
1790 // make var nocopy if it does not exist
1791 if (ptr_data
== 0) {
1792 m_vars
[i
].direction
.bits
=
1797 if (ptr_data
!= 0) {
1798 // need to update base in dope vector on device
1799 m_vars
[i
].flags
.sink_addr
= 1;
1803 if (ptr_data
!= 0) {
1805 // data is transferred if
1806 // - if always modifier is used OR
1807 // - if alloc_if == 0 && free_if == 0 OR
1808 // - if reference count is 1
1809 if (!m_vars
[i
].flags
.always_copy
&&
1810 (m_vars
[i
].alloc_if
||
1811 m_vars
[i
].free_if
) &&
1812 ptr_data
->get_reference() != 1) {
1813 m_vars
[i
].direction
.bits
=
1818 if (ptr_data
->alloc_disp
!= 0) {
1819 m_vars
[i
].flags
.alloc_disp
= 1;
1820 m_in_datalen
+= sizeof(alloc_disp
);
1823 if (m_vars
[i
].flags
.sink_addr
) {
1824 // get buffers's address on the sink
1825 if (!init_mic_address(ptr_data
)) {
1829 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
1832 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
1833 // need to decrement buffer reference on target
1834 m_need_runfunction
= true;
1837 // offset to base from the beginning of the buffer
1841 (char*) ptr_data
->cpu_addr
.start();
1843 // copy other pointer properties to var descriptor
1844 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
1845 m_vars
[i
].flags
.is_static
= ptr_data
->is_static
;
1848 else { // !src_is_for_mic
1849 if (!find_ptr_data(ptr_data
,
1856 m_vars
[i
].offset
= !ptr_data
? 0 :
1858 (char*) ptr_data
->cpu_addr
.start();
1861 // save pointer data
1862 m_vars_extra
[i
].src_data
= ptr_data
;
1867 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars
[i
].type
.src
);
1870 if (m_vars
[i
].type
.src
== c_data_ptr_array
) {
1874 if (src_is_for_mic
&& m_vars
[i
].flags
.is_stack_buf
) {
1875 m_vars
[i
].offset
= static_cast<char*>(m_vars
[i
].ptr
) -
1876 m_device
.m_persist_list
.front().cpu_stack_addr
;
1878 // if source is used at CPU save its offset and disp
1879 if (m_vars
[i
].into
== NULL
|| m_vars
[i
].direction
.in
) {
1880 m_vars_extra
[i
].cpu_offset
= m_vars
[i
].offset
;
1881 m_vars_extra
[i
].cpu_disp
= m_vars
[i
].disp
;
1884 // If "into" is define we need to do the similar work for it
1885 if (!m_vars
[i
].into
) {
1889 int64_t into_disp
=0, into_offset
= 0;
1891 switch (m_vars
[i
].type
.dst
) {
1892 case c_data_ptr_array
:
1897 int64_t size
= m_vars
[i
].size
;
1899 if (m_vars
[i
].type
.dst
== c_cean_var
) {
1901 const Arr_Desc
*ap
=
1902 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
1905 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
1907 // offset and length are derived from the array descriptor
1908 __arr_data_offset_and_length(ap
, into_disp
, size
);
1910 if (!is_arr_desc_contiguous(ap
)) {
1911 m_vars
[i
].flags
.is_noncont_dst
= 1;
1912 m_vars_extra
[i
].read_rng_dst
=
1913 init_read_ranges_arr_desc(ap
);
1914 if (!cean_ranges_match(
1915 m_vars_extra
[i
].read_rng_src
,
1916 m_vars_extra
[i
].read_rng_dst
)) {
1917 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
1921 m_vars
[i
].into
= reinterpret_cast<void*>(ap
->base
);
1924 int64_t size_src
= m_vars_extra
[i
].read_rng_src
?
1925 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
1927 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
1928 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
1930 // It's supposed that "into" size must be not less
1932 if (size_src
> size_dst
) {
1933 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
1934 size_src
, size_dst
);
1938 if (m_vars
[i
].direction
.bits
) {
1939 if (m_vars
[i
].flags
.is_static_dstn
) {
1942 // find data associated with variable
1943 if (!find_ptr_data(ptr_data
, m_vars
[i
].into
,
1944 into_disp
, size
, false, false)) {
1947 if (ptr_data
!= 0) {
1948 // offset to base from the beginning of the buffer
1951 (char*) m_vars
[i
].into
-
1952 (char*) ptr_data
->cpu_addr
.start();
1955 m_vars
[i
].flags
.is_static_dstn
= false;
1957 m_vars_extra
[i
].dst_data
= ptr_data
;
1961 if (m_vars
[i
].direction
.in
&&
1962 !m_vars
[i
].flags
.is_static_dstn
) {
1963 m_in_datalen
+= m_vars
[i
].size
;
1965 // for non-static target destination defined as CEAN
1966 // expression we pass to target its size and dist
1967 if (m_vars
[i
].type
.dst
== c_cean_var
) {
1968 m_in_datalen
+= 2 * sizeof(uint64_t);
1970 m_need_runfunction
= true;
1976 if (m_vars
[i
].direction
.bits
||
1977 m_vars
[i
].alloc_if
||
1978 m_vars
[i
].free_if
) {
1979 ArrDesc
*dvp
= static_cast<ArrDesc
*>(m_vars
[i
].into
);
1982 __dv_desc_dump("INTO", dvp
);
1984 // send dope vector contents excluding base
1985 m_in_datalen
+= m_vars
[i
].size
- sizeof(uint64_t);
1986 m_need_runfunction
= true;
1992 case c_cean_var_ptr
:
1994 int64_t size
= m_vars
[i
].size
;
1996 if (m_vars
[i
].type
.dst
== c_cean_var_ptr
) {
1998 const Arr_Desc
*ap
=
1999 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2002 ARRAY_DESC_DUMP(" ", "INTO", ap
, 1, src_is_for_mic
);
2004 // offset and length are derived from the array descriptor
2005 __arr_data_offset_and_length(ap
, into_disp
, size
);
2007 if (!is_arr_desc_contiguous(ap
)) {
2008 m_vars
[i
].flags
.is_noncont_src
= 1;
2009 m_vars_extra
[i
].read_rng_dst
=
2010 init_read_ranges_arr_desc(ap
);
2011 if (!cean_ranges_match(
2012 m_vars_extra
[i
].read_rng_src
,
2013 m_vars_extra
[i
].read_rng_dst
)) {
2014 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2017 m_vars
[i
].into
= reinterpret_cast<char**>(ap
->base
);
2019 else if (m_vars
[i
].type
.dst
== c_dv_ptr
) {
2020 // need to send DV to the device unless it is 'nocopy'
2021 if (m_vars
[i
].direction
.bits
||
2022 m_vars
[i
].alloc_if
||
2023 m_vars
[i
].free_if
) {
2024 ArrDesc
*dvp
= *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2027 __dv_desc_dump("INTO", dvp
);
2029 m_vars
[i
].direction
.bits
= c_parameter_in
;
2033 int64_t size_src
= m_vars_extra
[i
].read_rng_src
?
2034 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2036 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
2037 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2039 // It's supposed that "into" size must be not less than
2041 if (size_src
> size_dst
) {
2042 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2043 size_src
, size_dst
);
2047 if (m_vars
[i
].direction
.bits
) {
2051 void *base
= *static_cast<void**>(m_vars
[i
].into
);
2053 if (m_vars
[i
].direction
.in
) {
2055 if (m_vars
[i
].flags
.is_stack_buf
) {
2056 // for stack persistent objects ptr data is created
2057 // by var_desc with number 0.
2058 // Its ptr_data is stored at m_stack_ptr_data
2059 ptr_data
= m_stack_ptr_data
;
2060 m_vars
[i
].flags
.sink_addr
= 1;
2062 else if (m_vars
[i
].alloc_if
) {
2063 if (m_vars
[i
].flags
.preallocated
) {
2064 m_out_datalen
+= sizeof(void*);
2065 m_need_runfunction
= true;
2069 if (!alloc_ptr_data(
2071 reinterpret_cast<char *>(base
) + alloc_disp
,
2072 (alloc_base
!= NULL
) ?
2073 alloc_disp
: into_disp
,
2074 (alloc_base
!= NULL
) ?
2077 (alloc_base
!= NULL
) ?
2078 0 : m_vars
[i
].align
,
2079 m_vars
[i
].flags
.targetptr
,
2080 m_vars
[i
].flags
.preallocated
,
2081 m_vars
[i
].flags
.pin
)) {
2084 if (m_vars
[i
].flags
.targetptr
) {
2085 if (!init_mic_address(ptr_data
)) {
2088 *static_cast<void**>(m_vars
[i
].into
) = base
=
2089 reinterpret_cast<void*>(ptr_data
->mic_addr
);
2091 if (ptr_data
->add_reference() == 0 &&
2092 ptr_data
->mic_buf
!= 0) {
2093 // add buffer to the list of buffers that
2094 // are passed to dispatch call
2095 m_compute_buffers
.push_back(
2099 // will send buffer address to device
2100 m_vars
[i
].flags
.sink_addr
= 1;
2103 if (!ptr_data
->is_static
) {
2104 // need to add reference for buffer
2105 m_need_runfunction
= true;
2109 // use existing association from pointer table
2110 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2111 size
, m_vars
[i
].flags
.targetptr
, true)) {
2114 m_vars
[i
].flags
.sink_addr
= 1;
2117 if (ptr_data
->alloc_disp
!= 0) {
2118 m_vars
[i
].flags
.alloc_disp
= 1;
2119 m_in_datalen
+= sizeof(alloc_disp
);
2122 if (m_vars
[i
].flags
.sink_addr
) {
2123 // get buffers's address on the sink
2124 if (!init_mic_address(ptr_data
)) {
2128 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2131 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2132 // need to decrement buffer reference on target
2133 m_need_runfunction
= true;
2136 // copy other pointer properties to var descriptor
2137 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2138 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2141 if (!find_ptr_data(ptr_data
,
2150 into_offset
= ptr_data
?
2152 (char*) ptr_data
->cpu_addr
.start() :
2155 // save pointer data
2156 m_vars_extra
[i
].dst_data
= ptr_data
;
2166 case c_dv_data_slice
:
2167 case c_dv_ptr_data_slice
:
2168 if (m_vars
[i
].direction
.bits
||
2169 m_vars
[i
].alloc_if
||
2170 m_vars
[i
].free_if
) {
2177 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
2178 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2181 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
2183 dvp
= (m_vars
[i
].type
.dst
== c_dv_data_slice
) ?
2184 reinterpret_cast<ArrDesc
*>(ap
->base
) :
2185 *reinterpret_cast<ArrDesc
**>(ap
->base
);
2188 dvp
= (m_vars
[i
].type
.dst
== c_dv_data
) ?
2189 static_cast<ArrDesc
*>(m_vars
[i
].into
) :
2190 *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2192 if (!__dv_is_contiguous(dvp
)) {
2193 m_vars
[i
].flags
.is_noncont_dst
= 1;
2194 m_vars_extra
[i
].read_rng_dst
=
2195 init_read_ranges_dv(dvp
);
2197 // size and displacement
2198 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
2199 // offset and length are derived from the array
2201 __arr_data_offset_and_length(ap
, into_disp
, size
);
2202 if (m_vars
[i
].direction
.bits
) {
2203 if (!is_arr_desc_contiguous(ap
)) {
2204 if (m_vars
[i
].flags
.is_noncont_dst
) {
2205 LIBOFFLOAD_ERROR(c_slice_of_noncont_array
);
2208 m_vars
[i
].flags
.is_noncont_dst
= 1;
2209 m_vars_extra
[i
].read_rng_dst
=
2210 init_read_ranges_arr_desc(ap
);
2211 if (!cean_ranges_match(
2212 m_vars_extra
[i
].read_rng_src
,
2213 m_vars_extra
[i
].read_rng_dst
)) {
2214 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2220 if (m_vars
[i
].flags
.has_length
) {
2221 size
= __dv_data_length(dvp
, m_vars
[i
].count
);
2224 size
= __dv_data_length(dvp
);
2230 m_vars_extra
[i
].read_rng_src
?
2231 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2234 m_vars_extra
[i
].read_rng_dst
?
2235 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2237 // It's supposed that "into" size must be not less
2239 if (size_src
> size_dst
) {
2240 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2241 size_src
, size_dst
);
2246 void *base
= reinterpret_cast<void*>(dvp
->Base
);
2249 if (m_vars
[i
].direction
.in
) {
2250 if (m_vars
[i
].alloc_if
) {
2252 if (!alloc_ptr_data(
2254 reinterpret_cast<char *>(base
) + alloc_disp
,
2255 (alloc_base
!= NULL
) ?
2256 alloc_disp
: into_disp
,
2257 (alloc_base
!= NULL
) ?
2260 (alloc_base
!= NULL
) ?
2261 0 : m_vars
[i
].align
,
2262 m_vars
[i
].flags
.targetptr
,
2263 m_vars
[i
].flags
.preallocated
,
2264 m_vars
[i
].flags
.pin
)) {
2267 if (ptr_data
->add_reference() == 0 &&
2268 ptr_data
->mic_buf
!=0) {
2269 // add buffer to the list of buffers
2270 // that are passed to dispatch call
2271 m_compute_buffers
.push_back(
2275 // will send buffer address to device
2276 m_vars
[i
].flags
.sink_addr
= 1;
2279 if (!ptr_data
->is_static
) {
2280 // need to add reference for buffer
2281 m_need_runfunction
= true;
2285 // use existing association from pointer table
2286 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2287 size
, m_vars
[i
].flags
.targetptr
, true)) {
2291 // need to update base in dope vector on device
2292 m_vars
[i
].flags
.sink_addr
= 1;
2295 if (ptr_data
->alloc_disp
!= 0) {
2296 m_vars
[i
].flags
.alloc_disp
= 1;
2297 m_in_datalen
+= sizeof(alloc_disp
);
2300 if (m_vars
[i
].flags
.sink_addr
) {
2301 // get buffers's address on the sink
2302 if (!init_mic_address(ptr_data
)) {
2305 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2308 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2309 // need to decrement buffer reference on target
2310 m_need_runfunction
= true;
2313 // offset to base from the beginning of the buffer
2316 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2318 // copy other pointer properties to var descriptor
2319 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2320 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2322 else { // src_is_for_mic
2323 if (!find_ptr_data(ptr_data
,
2330 into_offset
= !ptr_data
?
2332 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2335 // save pointer data
2336 m_vars_extra
[i
].dst_data
= ptr_data
;
2341 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars
[i
].type
.src
);
2344 // if into is used at CPU save its offset and disp
2345 if (m_vars
[i
].direction
.out
) {
2346 m_vars_extra
[i
].cpu_offset
= into_offset
;
2347 m_vars_extra
[i
].cpu_disp
= into_disp
;
2350 if (m_vars
[i
].flags
.is_stack_buf
) {
2351 into_offset
= static_cast<char*>(m_vars
[i
].into
) -
2352 m_device
.m_persist_list
.front().cpu_stack_addr
;
2354 m_vars
[i
].offset
= into_offset
;
2355 m_vars
[i
].disp
= into_disp
;
2362 bool OffloadDescriptor::setup_misc_data(const char *name
)
2364 OffloadTimer
timer(get_timer_data(), c_offload_host_setup_misc_data
);
2366 // we can skip run functon call together with wait if offloaded
2367 // region is empty and there is no user defined non-pointer IN/OUT data
2368 if (m_need_runfunction
) {
2369 // variable descriptors are sent as input data
2370 m_in_datalen
+= m_vars_total
* sizeof(VarDesc
);
2372 // timer data is sent as a part of the output data
2373 m_out_datalen
+= OFFLOAD_TIMER_DATALEN();
2375 // max from input data and output data length
2376 uint64_t data_len
= m_in_datalen
> m_out_datalen
? m_in_datalen
:
2379 // Misc data has the following layout
2380 // <Function Descriptor>
2382 // <In/Out Data> (optional)
2384 // We can transfer copyin/copyout data in misc/return data which can
2385 // be passed to run function call if its size does not exceed
2386 // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
2389 m_func_desc_size
= sizeof(FunctionDescriptor
) + strlen(name
) + 1;
2390 m_func_desc_size
= (m_func_desc_size
+ 7) & ~7;
2392 int misc_data_offset
= 0;
2393 int misc_data_size
= 0;
2395 if (m_func_desc_size
+
2396 m_in_datalen
<= COI_PIPELINE_MAX_IN_MISC_DATA_LEN
&&
2397 m_out_datalen
<= COI_PIPELINE_MAX_IN_MISC_DATA_LEN
) {
2398 // use misc/return data for copyin/copyout
2399 misc_data_offset
= m_func_desc_size
;
2400 misc_data_size
= data_len
;
2403 OffloadTimer
timer_buf(get_timer_data(),
2404 c_offload_host_alloc_data_buffer
);
2406 // send/receive data using buffer
2407 COIRESULT res
= COI::BufferCreate(data_len
,
2410 1, &m_device
.get_process(),
2412 if (res
!= COI_SUCCESS
) {
2413 if (m_status
!= 0) {
2414 m_status
->result
= translate_coi_error(res
);
2417 report_coi_error(c_buf_create
, res
);
2420 m_compute_buffers
.push_back(m_inout_buf
);
2421 m_destroy_buffers
.push_back(m_inout_buf
);
2425 // initialize function descriptor
2426 m_func_desc
= (FunctionDescriptor
*) calloc(1, m_func_desc_size
2428 if (m_func_desc
== NULL
)
2429 LIBOFFLOAD_ERROR(c_malloc
);
2430 m_func_desc
->console_enabled
= console_enabled
;
2431 m_func_desc
->timer_enabled
= offload_report_enabled
&&
2432 (timer_enabled
|| offload_report_level
);
2433 m_func_desc
->offload_report_level
= offload_report_enabled
?
2434 offload_report_level
: 0;
2435 m_func_desc
->offload_number
= GET_OFFLOAD_NUMBER(get_timer_data());
2436 m_func_desc
->in_datalen
= m_in_datalen
;
2437 m_func_desc
->out_datalen
= m_out_datalen
;
2438 m_func_desc
->vars_num
= m_vars_total
;
2439 m_func_desc
->data_offset
= misc_data_offset
;
2441 // append entry name
2442 strcpy(m_func_desc
->data
, name
);
2448 void OffloadDescriptor::setup_omp_async_info()
2450 OFFLOAD_TRACE(2, "setup_omp_async_info\n");
2451 OmpAsyncLastEventType event_type
= m_need_runfunction
?
2452 c_last_runfunc
: c_last_write
;
2453 int last_in
= m_need_runfunction
? 0 : -1;
2456 for (i
= m_vars_total
- 1; i
>=0; i
--) {
2457 switch (m_vars
[i
].type
.dst
) {
2461 if (m_vars
[i
].direction
.out
&&
2462 m_vars
[i
].flags
.is_static_dstn
) {
2463 event_type
= c_last_read
;
2465 else if (last_in
< 0 && m_vars
[i
].direction
.in
&&
2466 m_vars
[i
].flags
.is_static_dstn
) {
2472 case c_cean_var_ptr
:
2476 case c_dv_data_slice
:
2477 case c_dv_ptr_data_slice
:
2479 if (m_vars
[i
].direction
.out
) {
2480 event_type
= c_last_read
;
2482 else if (last_in
< 0 && m_vars
[i
].direction
.in
) {
2489 if (event_type
== c_last_read
) {
2494 if (event_type
== c_last_read
) {
2495 m_vars_extra
[i
].omp_last_event_type
= c_last_read
;
2497 else if (event_type
== c_last_write
) {
2498 m_vars_extra
[last_in
].omp_last_event_type
= c_last_write
;
2500 m_omp_async_last_event_type
= event_type
;
2501 OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
2502 m_omp_async_last_event_type
);
2506 void offload_proxy_task_completed_ooo(
2512 task_completion_callback ((void *) info
);
2516 void OffloadDescriptor::register_omp_event_call_back(
2517 const COIEVENT
*event
,
2520 OFFLOAD_TRACE(2, "register_omp_event_call_back(event=%p, info=%p)\n",
2522 if (COI::EventRegisterCallback
) {
2523 COI::EventRegisterCallback(
2525 &offload_proxy_task_completed_ooo
,
2528 "COI::EventRegisterCallback found; callback registered\n");
2532 bool OffloadDescriptor::wait_dependencies(
2535 _Offload_stream handle
2538 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_deps
);
2540 OffloadDescriptor
*task
;
2541 if (num_waits
== 0) {
2546 if (num_waits
== -1) {
2548 // some specific stream of the device
2550 stream
= Stream::find_stream(handle
, false);
2552 // the stream was not created or was destroyed
2554 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_device
.get_logical_index());
2557 task
= stream
->get_last_offload();
2559 // offload was completed by previous offload_wait pragma
2564 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2568 stream
->set_last_offload(NULL
);
2571 // all streams of the device or over all devices
2573 StreamMap stream_map
= Stream::all_streams
;
2574 for (StreamMap::iterator it
= stream_map
.begin();
2575 it
!= stream_map
.end(); it
++) {
2576 Stream
* stream
= it
->second
;
2578 if (!m_wait_all_devices
&&
2579 stream
->get_device() != m_device
.get_logical_index()) {
2582 // get associated async task
2583 OffloadDescriptor
*task
= stream
->get_last_offload();
2585 // offload was completed by offload_wait pragma or wait clause
2589 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2593 stream
->set_last_offload(NULL
);
2596 // no uncompleted streams
2601 // if handle is equal to no_stream it's wait for signals
2602 for (int i
= 0; i
< num_waits
; i
++) {
2603 _Offload_stream stream_handle
;
2605 task
= m_device
.find_signal(waits
[i
], true);
2607 LIBOFFLOAD_ERROR(c_offload1
, m_device
.get_logical_index(),
2611 else if (task
== SIGNAL_IS_REMOVED
) {
2614 if (!task
->offload_finish(0)) { //arg is 0 for is_traceback
2618 // if the offload both has signal and is last offload of its
2619 // stream, we must wipe out the "last_offload" reference as
2620 // the offload already is finished.
2621 stream_handle
= task
->m_stream
;
2622 if (stream_handle
!= -1) {
2623 stream
= Stream::find_stream(stream_handle
, false);
2624 if (stream
&& stream
->get_last_offload() == task
) {
2625 stream
->set_last_offload(NULL
);
2634 bool OffloadDescriptor::offload_wrap(
2642 const void **signal
,
2644 const void *stack_addr
,
2645 OffloadFlags offload_flags
2648 OffloadWaitKind wait_kind
= c_offload_wait_signal
;
2649 bool is_traceback
= offload_flags
.bits
.fortran_traceback
;
2651 // define kind of wait if any;
2652 // there can be one off the following kind:
2653 // 1. c_offload_wait_signal for "offload_wait wait(signal)"
2654 // 2. c_offload_wait_stream for "offload_wait stream(stream)"
2655 // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
2656 if (num_waits
== -1) {
2657 wait_kind
= (m_stream
== 0) ?
2658 c_offload_wait_all_streams
:
2659 c_offload_wait_stream
;
2662 const char *stream_str
;
2664 if (m_stream
== no_stream
|| num_waits
>= 0) {
2665 stream_str
= "none";
2667 else if (m_stream
== 0) {
2671 sprintf(buf
, "%#llx", m_stream
);
2676 OFFLOAD_DEBUG_TRACE_1(1,
2677 GET_OFFLOAD_NUMBER(get_timer_data()),
2678 c_offload_init_func
,
2679 "Offload function %s, is_empty=%d, #varDescs=%d, "
2680 "signal=none, stream=%s, #waits=%d%c",
2681 name
, is_empty
, vars_total
, stream_str
, num_waits
,
2682 num_waits
== 0 ? '\n' : ' ');
2683 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2684 // since the number of waits is not fixed.
2685 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
2688 if (m_stream
== no_stream
) {
2689 printf("%p", waits
[0]);
2690 for (int i
= 1; i
< num_waits
; i
++) {
2691 printf(", %p", waits
[i
]);
2694 else if (m_stream
!= 0) {
2695 printf("%#x", m_stream
);
2698 printf(" all streams");
2705 // stream in wait is reported further in OFFLOAD_REPORT for waits
2706 if (m_stream
!= no_stream
&& num_waits
== 0) {
2707 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2711 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2716 OFFLOAD_DEBUG_TRACE_1(1,
2717 GET_OFFLOAD_NUMBER(get_timer_data()),
2718 c_offload_init_func
,
2719 "Offload function %s, is_empty=%d, #varDescs=%d, "
2720 "signal=%p, stream=%s, #waits=%d%c",
2721 name
, is_empty
, vars_total
, *signal
, stream_str
, num_waits
,
2722 num_waits
== 0 ? '\n' : ' ');
2723 // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
2724 // since the number of waits is not fixed.
2725 if (!OFFLOAD_DO_TRACE
&& (console_enabled
>= 1)) {
2728 if (m_stream
== no_stream
) {
2729 printf("%p", waits
[0]);
2730 for (int i
= 1; i
< num_waits
; i
++) {
2731 printf(", %p", waits
[i
]);
2735 else if (m_stream
!= 0) {
2736 printf("%#x", m_stream
);
2739 printf(" all streams");
2746 // stream in wait is reported further in OFFLOAD_REPORT for waits
2747 if (m_stream
!= no_stream
&& num_waits
== 0) {
2748 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2752 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2756 if (console_enabled
>= 1 && offload_flags
.flags
!= 0) {
2757 trace_offload_flags(get_timer_data(), offload_flags
);
2760 OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
2761 c_offload_wait
, "%d\n",
2762 wait_kind
, num_waits
,
2763 (wait_kind
== c_offload_wait_signal
) ?
2765 reinterpret_cast<const void **>(m_stream
));
2767 if (m_status
!= 0) {
2768 m_status
->result
= OFFLOAD_SUCCESS
;
2769 m_status
->device_number
= m_device
.get_logical_index();
2772 m_initial_need_runfunction
= m_need_runfunction
= !is_empty
;
2774 // wait for dependencies to finish
2775 if (!wait_dependencies(waits
, num_waits
, m_stream
)) {
2781 if (!setup_descriptors(vars
, vars2
, vars_total
, entry_id
, stack_addr
)) {
2786 if (offload_flags
.bits
.omp_async
) {
2787 setup_omp_async_info();
2790 // initiate send for pointers. Want to do it as early as possible.
2791 if (!send_pointer_data(signal
!= 0 || offload_flags
.bits
.omp_async
,
2797 // setup misc data for run function
2798 if (!setup_misc_data(name
)) {
2803 // gather copyin data into buffer
2804 if (!gather_copyin_data()) {
2809 // Start the computation
2810 if (!compute(signal
)) {
2815 // initiate receive for pointers
2816 if (!receive_pointer_data(signal
!= 0 || offload_flags
.bits
.omp_async
,
2821 if (offload_flags
.bits
.omp_async
) {
2824 // if there is a signal or stream save descriptor for the later use.
2825 // num_waits == -1 is for offload_wait and there is nothing to save
2826 if (num_waits
!= -1 && (signal
!= 0 || m_stream
!= no_stream
)) {
2828 m_device
.add_signal(*signal
, this);
2831 if (m_stream
!= no_stream
&& m_stream
!= 0) {
2832 Stream
* stream
= Stream::find_stream(m_stream
, false);
2834 stream
->set_last_offload(this);
2837 LIBOFFLOAD_ERROR(c_offload_no_stream
, m_device
.get_logical_index());
2841 // if there is a clause with alloc_if(1) and preallocated need to call
2842 // offload_finish after runfunction
2843 if (!m_preallocated_alloc
) {
2848 // wait for the offload to finish.
2849 if (!offload_finish(is_traceback
)) {
// OffloadDescriptor::offload -- public entry point for one offload request.
// Forwards every argument to offload_wrap() and, when that fails and no
// traceback has been produced yet, lets the Fortran runtime continue the
// traceback from the coprocessor (MIC) side before returning.
// NOTE(review): this extraction is missing many original source lines
// (parameter declarations 2859-2865/2867/2870-2872, braces, the trace-call
// opener on line 2878, and the final return) -- the surviving fragments
// below are preserved verbatim.
2858 bool OffloadDescriptor::offload(
2866 const void **signal
,
2868 const void *stack_addr
,
2869 OffloadFlags offload_flags
// Delegate the real work; `res` carries overall success/failure.
2873 res
= offload_wrap(name
, is_empty
, vars
, vars2
, vars_total
,
2874 waits
, num_waits
, signal
, entry_id
,
2875 stack_addr
, offload_flags
);
// On failure, run the Fortran traceback at most once (guarded by
// m_traceback_called) and only when the caller set fortran_traceback.
2876 if (res
== false && !m_traceback_called
) {
2877 if (offload_flags
.bits
.fortran_traceback
) {
// String argument of a trace call whose opening line (orig. 2878) is
// missing from this extraction.
2879 "Calling Fortran library to continue traceback from MIC\n");
2880 FORTRAN_TRACE_BACK(m_status
->result
);
// Remember that the traceback already ran so it is not repeated later
// (offload_finish checks the same flag).
2881 m_traceback_called
= true;
// OffloadDescriptor::offload_finish -- completes an offload on the host side:
//   1. waits for the compute ("in") dependency events,
//   2. scatters copyout data received from the target,
//   3. optionally performs a second receive pass for preallocated targets,
//   4. waits for the buffer-read ("out") dependency events,
//   5. destroys buffers queued in m_destroy_buffers.
// Any COI failure is translated into m_status->result when a status object is
// attached; otherwise it is reported fatally via report_coi_error().
// NOTE(review): many original lines (parameter list, braces, do/while openers)
// are missing from this extraction; fragments are preserved verbatim.
2887 bool OffloadDescriptor::offload_finish(
2893 // wait for compute dependencies to become signaled
2894 if (m_in_deps_total
> 0) {
2895 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_compute
);
// Active wait: poll EventWait with a zero timeout in a loop (the `do {`
// opener, orig. ~2898-2899, is among the missing lines) until it stops
// returning COI_TIME_OUT_REACHED.
2897 if (__offload_active_wait
) {
2900 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, 0, 1, 0, 0);
2902 while (res
== COI_TIME_OUT_REACHED
);
// Passive wait: block indefinitely (timeout -1) until events signal.
2905 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, -1, 1, 0, 0);
// Wait failed: record the translated error and run the Fortran traceback
// once, if it has not run yet.
2908 if (res
!= COI_SUCCESS
) {
2909 if (m_status
!= 0 && !m_traceback_called
) {
2910 m_status
->result
= translate_coi_error(res
);
// String argument of a trace call whose opener (orig. ~2912) is missing.
2913 "Calling Fortran library to continue traceback from MIC\n");
2914 FORTRAN_TRACE_BACK(m_status
->result
);
2915 m_traceback_called
= true;
// Caller-requested traceback path (is_traceback) when no status object
// absorbed the error above.
2920 if (is_traceback
&& !m_traceback_called
) {
2922 "Calling Fortran library to continue traceback from MIC\n");
2923 FORTRAN_TRACE_BACK(OFFLOAD_ERROR
);
2924 m_traceback_called
= true;
// No status and no traceback: fatal report.
2927 report_coi_error(c_event_wait
, res
);
2931 // scatter copyout data received from target
2932 if (!scatter_copyout_data()) {
// Preallocated OUT targets need a second receive_pointer_data pass
// (first_run == false); it must also wait on pending out-deps.
2936 if (m_out_with_preallocated
&&
2937 !receive_pointer_data(m_out_deps_total
> 0, false, NULL
)) {
2942 // wait for receive dependencies to become signaled
2943 if (m_out_deps_total
> 0) {
2944 OffloadTimer
timer(get_timer_data(), c_offload_host_wait_buffers_reads
);
// Same active/passive wait pattern as for the compute dependencies above.
2946 if (__offload_active_wait
) {
2949 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, 0, 1, 0, 0);
2951 while (res
== COI_TIME_OUT_REACHED
);
2954 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, -1, 1, 0, 0);
2957 if (res
!= COI_SUCCESS
) {
2958 if (m_status
!= 0) {
2959 m_status
->result
= translate_coi_error(res
);
2962 report_coi_error(c_event_wait
, res
);
// Destroy every buffer queued for destruction during this offload.
2968 OffloadTimer
timer(get_timer_data(), c_offload_host_destroy_buffers
);
2970 for (BufferList::const_iterator it
= m_destroy_buffers
.begin();
2971 it
!= m_destroy_buffers
.end(); it
++) {
2972 res
= COI::BufferDestroy(*it
);
2973 if (res
!= COI_SUCCESS
) {
2974 if (m_status
!= 0) {
2975 m_status
->result
= translate_coi_error(res
);
2978 report_coi_error(c_buf_destroy
, res
);
// OffloadDescriptor::cleanup -- final bookkeeping after an offload:
// releases the device slot in ORSL, stops the total-offload timer and
// emits the offload-report epilog.
// NOTE(review): braces and a few lines (orig. 2987, 2990, 2992-2993, 2995)
// are missing from this extraction.
2986 void OffloadDescriptor::cleanup()
2988 // release device in orsl
2989 ORSL::release(m_device
.get_logical_index());
2991 OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload
);
2994 Offload_Report_Epilog(get_timer_data());
// OffloadDescriptor::is_signaled -- non-blocking check whether this offload
// has fully completed: polls both the compute ("in") and the receive ("out")
// dependency sets with a zero timeout; `signaled` stays true only if every
// non-empty set reports COI_SUCCESS.
// NOTE(review): the declaration of `res`, closing braces and the final
// `return signaled;` (orig. ~3000, 3006, 3010-3012) are missing from this
// extraction.
2997 bool OffloadDescriptor::is_signaled()
2999 bool signaled
= true;
3002 // check compute and receive dependencies
3003 if (m_in_deps_total
> 0) {
// Timeout 0 => pure poll; COI_SUCCESS only if all events already fired.
3004 res
= COI::EventWait(m_in_deps_total
, m_in_deps
, 0, 1, 0, 0);
3005 signaled
= signaled
&& (res
== COI_SUCCESS
);
3007 if (m_out_deps_total
> 0) {
3008 res
= COI::EventWait(m_out_deps_total
, m_out_deps
, 0, 1, 0, 0);
3009 signaled
= signaled
&& (res
== COI_SUCCESS
);
// make_arr_desc -- heap-allocates an Arr_Desc and fills it in as a rank-1
// array descriptor: dim[0] covers the element range
// [extent_start_val, extent_start_val + extent_elements_val - 1] with unit
// stride; `base` is the integer value of the source pointer.
// On allocation failure LIBOFFLOAD_ERROR(c_malloc) is invoked (its guarding
// `if` line, orig. 3024, is missing from this extraction).
// NOTE(review): the `ptr_val`/`size` parameter lines (orig. 3016, 3019), the
// rank assignment and the final `return res;` are also among the missing
// lines; fragments are preserved verbatim.
3015 static Arr_Desc
* make_arr_desc(
3017 int64_t extent_start_val
,
3018 int64_t extent_elements_val
,
3023 res
= (Arr_Desc
*)malloc(sizeof(Arr_Desc
));
3025 LIBOFFLOAD_ERROR(c_malloc
);
// Store the pointer as an integer base address.
3026 res
->base
= reinterpret_cast<int64_t>(ptr_val
);
// Single-dimension descriptor: element size, lower/upper bounds, stride 1.
3028 res
->dim
[0].size
= size
;
3029 res
->dim
[0].lindex
= 0;
3030 res
->dim
[0].lower
= extent_start_val
;
3031 res
->dim
[0].upper
= extent_elements_val
+ extent_start_val
- 1;
3032 res
->dim
[0].stride
= 1;
3036 // Send pointer data if source or destination or both of them are
3037 // noncontiguous. There is guarantee that length of destination enough for
3038 // transferred data.
3039 bool OffloadDescriptor::send_noncontiguous_pointer_data(
3044 uint64_t &data_sent
,
3045 uint32_t in_deps_amount
,
3049 int64_t offset_src
, offset_dst
;
3050 int64_t length_src
, length_dst
;
3051 int64_t length_src_cur
, length_dst_cur
;
3054 bool dst_is_empty
= true;
3055 bool src_is_empty
= true;
3059 // Set length_src and length_dst
3060 length_src
= (m_vars_extra
[i
].read_rng_src
) ?
3061 m_vars_extra
[i
].read_rng_src
->range_size
: m_vars
[i
].size
;
3062 length_dst
= !m_vars
[i
].into
? length_src
:
3063 (m_vars_extra
[i
].read_rng_dst
) ?
3064 m_vars_extra
[i
].read_rng_dst
->range_size
: m_vars
[i
].size
;
3065 send_size
= (length_src
< length_dst
) ? length_src
: length_dst
;
3067 // If BufferWriteMultiD is defined we can set values of required arguments
3068 // and transfer noncontiguous data via call to the COI routine.
3069 if (__offload_use_coi_noncontiguous_transfer
&& COI::BufferWriteMultiD
) {
3070 struct Arr_Desc
* arr_desc_dst
;
3071 struct Arr_Desc
* arr_desc_src
;
3072 int64_t size_src
, size_dst
;
3073 char *base
= offload_get_src_base(static_cast<char*>(m_vars
[i
].ptr
),
3074 m_vars
[i
].type
.src
);
3075 COIBUFFER dst_buf
= m_vars
[i
].into
?
3076 m_vars_extra
[i
].dst_data
->mic_buf
:
3077 m_vars_extra
[i
].src_data
->mic_buf
;
3079 offset_src
= (m_vars_extra
[i
].read_rng_src
)?
3080 m_vars_extra
[i
].read_rng_src
->init_offset
: m_vars_extra
[i
].cpu_disp
;
3081 size_src
= m_vars_extra
[i
].read_rng_src
?
3082 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
3085 offset_dst
= (m_vars_extra
[i
].read_rng_dst
)?
3086 m_vars_extra
[i
].read_rng_dst
->init_offset
: m_vars
[i
].disp
;
3087 size_dst
= m_vars_extra
[i
].read_rng_dst
?
3088 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) : m_vars
[i
].size
;
3090 int64_t el_size
= (!m_vars
[i
].into
||
3091 (m_vars_extra
[i
].read_rng_src
&& m_vars_extra
[i
].read_rng_dst
)) ?
3093 m_vars_extra
[i
].read_rng_src
?
3094 m_vars_extra
[i
].read_rng_src
->arr_desc
->dim
[
3095 m_vars_extra
[i
].read_rng_src
->arr_desc
->rank
- 1].size
:
3096 m_vars_extra
[i
].read_rng_dst
->arr_desc
->dim
[
3097 m_vars_extra
[i
].read_rng_dst
->arr_desc
->rank
- 1].size
;
3099 arr_desc_src
= (m_vars_extra
[i
].read_rng_src
) ?
3100 m_vars_extra
[i
].read_rng_src
->arr_desc
:
3101 make_arr_desc(NULL
, // don't required for source
3102 offset_src
/el_size
, size_src
/el_size
, el_size
);
3104 arr_desc_dst
= !m_vars
[i
].into
?
3106 (m_vars_extra
[i
].read_rng_dst
) ?
3107 m_vars_extra
[i
].read_rng_dst
->arr_desc
:
3109 offset_dst
/el_size
, size_src
/el_size
, el_size
);
3111 int64_t alloc_disp
= m_vars
[i
].into
?
3112 m_vars_extra
[i
].dst_data
->alloc_disp
:
3113 m_vars_extra
[i
].src_data
->alloc_disp
;
3115 arr_desc_src
->base
= reinterpret_cast<int64_t>(base
);
3116 arr_desc_dst
->base
= 0;
3118 res
= COI::BufferWriteMultiD(
3119 dst_buf
, // in_DestBuffer,
3120 m_device
.get_process(), // DestProcess,
3121 m_vars
[i
].offset
+ m_vars
[i
].mic_offset
-
3122 alloc_disp
, // Offset
3123 (void*)arr_desc_dst
, // descriptor of DestArray
3124 (void*)arr_desc_src
, // descriptor of SrcArray
3125 COI_COPY_UNSPECIFIED
, // Type
3126 in_deps_amount
, // Number of in Dependencies
3127 in_deps
, // array of in Dependencies
3128 event
); // out Dependency
3129 if (res
!= COI_SUCCESS
) {
3130 if (m_status
!= 0) {
3131 m_status
->result
= translate_coi_error(res
);
3134 report_coi_error(c_buf_copy
, res
);
3139 // if event is defined we must multiplate it for all contiguous intervals
3140 // that will be Copied/Write.
3141 // Take in account that we already have 1 event.
3143 m_in_deps_allocated
+= (length_src
/ send_size
) *
3144 ((m_vars_extra
[i
].read_rng_src
) ?
3145 m_vars_extra
[i
].read_rng_src
->range_max_number
: 1) ;
3147 (COIEVENT
*)realloc(m_in_deps
, sizeof(COIEVENT
) * m_in_deps_allocated
);
3151 // consequently get contiguous ranges,
3152 // define corresponded destination offset and send data
3155 if (m_vars_extra
[i
].read_rng_src
) {
3156 if (!get_next_range(m_vars_extra
[i
].read_rng_src
,
3158 // source ranges are over - nothing to send
3162 else if (data_sent
== 0) {
3163 offset_src
= m_vars_extra
[i
].cpu_disp
;
3168 length_src_cur
= length_src
;
3171 // if source is contiguous or its contiguous range is greater
3172 // than destination one
3173 offset_src
+= send_size
;
3175 length_src_cur
-= send_size
;
3176 src_is_empty
= length_src_cur
== 0;
3179 if (m_vars
[i
].into
) {
3180 if (m_vars_extra
[i
].read_rng_dst
) {
3181 if (!get_next_range(m_vars_extra
[i
].read_rng_dst
,
3183 // destination ranges are over
3184 LIBOFFLOAD_ERROR(c_destination_is_over
);
3188 // into is contiguous.
3190 offset_dst
= m_vars
[i
].disp
;
3192 length_dst_cur
= length_dst
;
3196 offset_dst
= offset_src
;
3197 length_dst_cur
= length_src
;
3201 // if destination is contiguous or its contiguous range is greater
3203 offset_dst
+= send_size
;
3205 length_dst_cur
-= send_size
;
3206 dst_is_empty
= length_dst_cur
== 0;
3209 event
= &m_in_deps
[m_in_deps_total
++];
3211 if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3212 res
= COI::BufferCopy(
3215 m_vars
[i
].mic_offset
+
3216 m_vars
[i
].offset
+ offset_dst
,
3217 m_vars_extra
[i
].cpu_offset
+ offset_src
,
3219 COI_COPY_UNSPECIFIED
,
3220 in_deps_amount
, in_deps
,
3222 if (res
!= COI_SUCCESS
) {
3223 if (m_status
!= 0) {
3224 m_status
->result
= translate_coi_error(res
);
3227 report_coi_error(c_buf_copy
, res
);
3231 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3232 m_vars
[i
].type
.src
);
3234 res
= COI::BufferWrite(
3236 m_vars
[i
].mic_offset
+
3237 m_vars
[i
].offset
+ offset_dst
,
3240 COI_COPY_UNSPECIFIED
,
3241 in_deps_amount
, in_deps
,
3243 if (res
!= COI_SUCCESS
) {
3244 if (m_status
!= 0) {
3245 m_status
->result
= translate_coi_error(res
);
3248 report_coi_error(c_buf_write
, res
);
3251 data_sent
+= send_size
;
3257 bool OffloadDescriptor::send_pointer_data(bool is_async
, void* info
)
3259 OffloadTimer
timer(get_timer_data(), c_offload_host_send_pointers
);
3261 bool should_use_async_buffer_write
= m_initial_need_runfunction
;
3262 uint64_t ptr_sent
= 0;
3264 uint32_t in_deps_amount
= 0;
3265 COIEVENT
*in_deps
= NULL
;
3267 // For offload_transfer and offload with empty body without signal:
3268 // - if there is only one buffer copy - send data synchronously
3269 // - if there are multiple buffer copy and
3270 // __offload_parallel_copy is false - send data synchronously
3271 // - if there are multiple buffer copy and
3272 // __offload_parallel_copy is true - send data asynchronously
3273 // It concerns only big size data - greater than __offload_use_async_buffer_write.
3274 // Data of size less than __offload_use_async_buffer_write are sent synchronously.
3275 // Synchronous transfer results in better performance in COI.
3276 // __offload_parallel_copy is false by default but can be changed
3277 // via environment variable OFFLOAD_PARALLEL_COPY
3278 if (!m_initial_need_runfunction
&& __offload_parallel_copy
) {
3279 int big_size_count
= 0;
3280 for (int i
= 0; i
< m_vars_total
; i
++) {
3281 if (m_vars
[i
].direction
.in
&&
3282 m_vars
[i
].size
>= __offload_use_async_buffer_write
) {
3283 switch (m_vars
[i
].type
.dst
) {
3287 if (m_vars
[i
].flags
.is_static_dstn
) {
3293 case c_cean_var_ptr
:
3297 case c_dv_data_slice
:
3298 case c_dv_ptr_data_slice
:
3306 if (big_size_count
> 1) {
3307 should_use_async_buffer_write
= true;
3311 if (m_stream
!= no_stream
&& m_vars_total
!= 0) {
3312 get_stream_in_dependencies(in_deps_amount
, in_deps
);
3315 // Initiate send for pointer data
3316 for (int i
= 0; i
< m_vars_total
; i
++) {
3317 uint64_t sent_data
= m_vars
[i
].size
;
3318 uint32_t in_deps_amount_save
;
3319 COIEVENT
*in_deps_save
;
3321 if (m_vars_extra
[i
].omp_last_event_type
== c_last_write
) {
3322 in_deps_amount_save
= in_deps_amount
;
3323 in_deps_save
= in_deps
;
3324 in_deps_amount
= m_in_deps_total
;
3325 if (in_deps_amount
> 0) {
3326 in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * in_deps_amount
);
3327 if (in_deps
== NULL
)
3328 LIBOFFLOAD_ERROR(c_malloc
);
3329 memcpy(in_deps
, m_in_deps
,in_deps_amount
* sizeof(COIEVENT
));
3332 switch (m_vars
[i
].type
.dst
) {
3333 case c_data_ptr_array
:
3338 if (m_vars
[i
].direction
.in
&&
3339 m_vars
[i
].flags
.is_static_dstn
) {
3342 (should_use_async_buffer_write
&&
3343 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3344 &m_in_deps
[m_in_deps_total
++] : 0;
3345 PtrData
* dst_data
= m_vars
[i
].into
?
3346 m_vars_extra
[i
].dst_data
:
3347 m_vars_extra
[i
].src_data
;
3349 VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3350 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3351 m_vars
[i
].flags
.is_static
?
3352 m_vars_extra
[i
].src_data
: 0;
3354 if (m_vars
[i
].flags
.is_noncont_src
||
3355 m_vars
[i
].flags
.is_noncont_dst
) {
3356 if (!send_noncontiguous_pointer_data(
3357 i
, src_data
, dst_data
, event
, sent_data
,
3358 in_deps_amount
, in_deps
)) {
3362 else if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3363 res
= COI::BufferCopy(
3366 m_vars
[i
].mic_offset
+
3367 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3368 m_vars_extra
[i
].cpu_offset
+
3369 m_vars_extra
[i
].cpu_disp
,
3371 COI_COPY_UNSPECIFIED
,
3372 in_deps_amount
, in_deps
,
3374 if (res
!= COI_SUCCESS
) {
3375 if (m_status
!= 0) {
3376 m_status
->result
= translate_coi_error(res
);
3379 report_coi_error(c_buf_copy
, res
);
3383 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3384 m_vars
[i
].type
.src
);
3385 res
= COI::BufferWrite(
3387 m_vars
[i
].mic_offset
+
3388 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3389 base
+ m_vars_extra
[i
].cpu_disp
,
3391 COI_COPY_UNSPECIFIED
,
3392 in_deps_amount
, in_deps
,
3394 if (res
!= COI_SUCCESS
) {
3395 if (m_status
!= 0) {
3396 m_status
->result
= translate_coi_error(res
);
3399 report_coi_error(c_buf_write
, res
);
3402 ptr_sent
+= sent_data
;
3408 case c_cean_var_ptr
:
3410 if (m_vars
[i
].direction
.in
&& m_vars
[i
].size
> 0) {
3413 (should_use_async_buffer_write
&&
3414 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3415 &m_in_deps
[m_in_deps_total
++] : 0;
3416 PtrData
* dst_data
= m_vars
[i
].into
?
3417 m_vars_extra
[i
].dst_data
:
3418 m_vars_extra
[i
].src_data
;
3420 VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3421 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3422 m_vars
[i
].flags
.is_static
?
3423 m_vars_extra
[i
].src_data
: 0;
3425 if (m_vars
[i
].flags
.is_noncont_src
||
3426 m_vars
[i
].flags
.is_noncont_dst
) {
3427 send_noncontiguous_pointer_data(
3428 i
, src_data
, dst_data
, event
, sent_data
,
3429 in_deps_amount
, in_deps
);
3431 else if (src_data
!= 0 && src_data
->cpu_buf
!= 0) {
3432 res
= COI::BufferCopy(
3435 m_vars
[i
].mic_offset
+
3436 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3437 m_vars_extra
[i
].cpu_offset
+
3438 m_vars_extra
[i
].cpu_disp
,
3440 COI_COPY_UNSPECIFIED
,
3441 in_deps_amount
, in_deps
,
3443 if (res
!= COI_SUCCESS
) {
3444 if (m_status
!= 0) {
3445 m_status
->result
= translate_coi_error(res
);
3448 report_coi_error(c_buf_copy
, res
);
3452 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3453 m_vars
[i
].type
.src
);
3454 res
= COI::BufferWrite(
3456 m_vars
[i
].mic_offset
+
3457 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3458 base
+ m_vars_extra
[i
].cpu_disp
,
3460 COI_COPY_UNSPECIFIED
,
3461 in_deps_amount
, in_deps
,
3463 if (res
!= COI_SUCCESS
) {
3464 if (m_status
!= 0) {
3465 m_status
->result
= translate_coi_error(res
);
3468 report_coi_error(c_buf_write
, res
);
3472 ptr_sent
+= sent_data
;
3478 if (m_vars
[i
].direction
.in
&&
3479 m_vars
[i
].size
> 0) {
3480 PtrData
*ptr_data
= m_vars
[i
].into
?
3481 m_vars_extra
[i
].dst_data
:
3482 m_vars_extra
[i
].src_data
;
3483 PtrData
* src_data
= m_vars_extra
[i
].src_data
;
3487 (should_use_async_buffer_write
&&
3488 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3489 &m_in_deps
[m_in_deps_total
++] : 0;
3491 if (m_vars
[i
].flags
.is_noncont_src
||
3492 m_vars
[i
].flags
.is_noncont_dst
) {
3493 send_noncontiguous_pointer_data(
3494 i
, src_data
, ptr_data
, event
, sent_data
,
3495 in_deps_amount
, in_deps
);
3497 else if (src_data
&& src_data
->cpu_buf
!= 0) {
3498 res
= COI::BufferCopy(
3501 m_vars
[i
].offset
+ ptr_data
->mic_offset
+
3503 m_vars_extra
[i
].cpu_offset
+
3504 m_vars_extra
[i
].cpu_disp
,
3506 COI_COPY_UNSPECIFIED
,
3507 in_deps_amount
, in_deps
,
3509 if (res
!= COI_SUCCESS
) {
3510 if (m_status
!= 0) {
3511 m_status
->result
= translate_coi_error(res
);
3514 report_coi_error(c_buf_copy
, res
);
3518 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3519 m_vars
[i
].type
.src
);
3520 res
= COI::BufferWrite(
3522 ptr_data
->mic_offset
+
3523 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3524 base
+ m_vars_extra
[i
].cpu_disp
,
3526 COI_COPY_UNSPECIFIED
,
3527 in_deps_amount
, in_deps
,
3529 if (res
!= COI_SUCCESS
) {
3530 if (m_status
!= 0) {
3531 m_status
->result
= translate_coi_error(res
);
3534 report_coi_error(c_buf_write
, res
);
3537 ptr_sent
+= sent_data
;
3541 case c_dv_data_slice
:
3542 case c_dv_ptr_data_slice
:
3543 if (m_vars
[i
].direction
.in
&&
3544 m_vars
[i
].size
> 0) {
3545 PtrData
*dst_data
= m_vars
[i
].into
?
3546 m_vars_extra
[i
].dst_data
:
3547 m_vars_extra
[i
].src_data
;
3549 (VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) ||
3550 VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.src
) ||
3551 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
) ||
3552 VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.src
) &&
3553 m_vars
[i
].flags
.is_static
) ?
3554 m_vars_extra
[i
].src_data
: 0;
3557 (should_use_async_buffer_write
&&
3558 m_vars
[i
].size
>= __offload_use_async_buffer_write
)) ?
3559 &m_in_deps
[m_in_deps_total
++] : 0;
3560 if (m_vars
[i
].flags
.is_noncont_src
||
3561 m_vars
[i
].flags
.is_noncont_dst
) {
3562 send_noncontiguous_pointer_data(
3563 i
, src_data
, dst_data
, event
, sent_data
,
3564 in_deps_amount
, in_deps
);
3566 else if (src_data
&& src_data
->cpu_buf
!= 0) {
3567 res
= COI::BufferCopy(
3571 dst_data
->mic_offset
+
3573 m_vars_extra
[i
].cpu_offset
+
3574 m_vars_extra
[i
].cpu_disp
,
3576 COI_COPY_UNSPECIFIED
,
3577 in_deps_amount
, in_deps
,
3579 if (res
!= COI_SUCCESS
) {
3580 if (m_status
!= 0) {
3581 m_status
->result
= translate_coi_error(res
);
3584 report_coi_error(c_buf_copy
, res
);
3588 char *base
= offload_get_src_base(m_vars
[i
].ptr
,
3589 m_vars
[i
].type
.src
);
3590 res
= COI::BufferWrite(
3592 dst_data
->mic_offset
+
3593 m_vars
[i
].offset
+ m_vars
[i
].disp
,
3594 base
+ m_vars_extra
[i
].cpu_disp
,
3596 COI_COPY_UNSPECIFIED
,
3597 in_deps_amount
, in_deps
,
3599 if (res
!= COI_SUCCESS
) {
3600 if (m_status
!= 0) {
3601 m_status
->result
= translate_coi_error(res
);
3604 report_coi_error(c_buf_write
, res
);
3608 ptr_sent
+= sent_data
;
3615 if (m_vars_extra
[i
].omp_last_event_type
== c_last_write
) {
3616 in_deps_amount
= in_deps_amount_save
;
3617 in_deps
= in_deps_save
;
3618 register_omp_event_call_back(&m_in_deps
[m_in_deps_total
- 1], info
);
3620 // alloc field isn't used at target.
3621 // We can reuse it for offset of array pointers.
3622 if (m_vars_extra
[i
].is_arr_ptr_el
) {
3623 m_vars
[i
].ptr_arr_offset
= m_vars_extra
[i
].ptr_arr_offset
;
3628 m_status
->data_sent
+= ptr_sent
;
3631 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent
);
3632 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
3633 c_offload_sent_pointer_data
,
3634 "Total pointer data sent to target: [%lld] bytes\n",
3640 bool OffloadDescriptor::gather_copyin_data()
3642 OffloadTimer
timer(get_timer_data(), c_offload_host_gather_inputs
);
3644 if (m_need_runfunction
&& m_in_datalen
> 0) {
3645 COIMAPINSTANCE map_inst
;
3649 if (m_inout_buf
!= 0) {
3650 OffloadTimer
timer_map(get_timer_data(),
3651 c_offload_host_map_in_data_buffer
);
3653 COIRESULT res
= COI::BufferMap(m_inout_buf
, 0, m_in_datalen
,
3654 COI_MAP_WRITE_ENTIRE_BUFFER
,
3656 reinterpret_cast<void**>(&data
));
3657 if (res
!= COI_SUCCESS
) {
3658 if (m_status
!= 0) {
3659 m_status
->result
= translate_coi_error(res
);
3662 report_coi_error(c_buf_map
, res
);
3666 data
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
3669 // send variable descriptors
3670 memcpy(data
, m_vars
, m_vars_total
* sizeof(VarDesc
));
3671 data
+= m_vars_total
* sizeof(VarDesc
);
3674 m_in
.init_buffer(data
, m_in_datalen
);
3676 // Gather copy data into buffer
3677 for (int i
= 0; i
< m_vars_total
; i
++) {
3678 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
3679 m_vars
[i
].into
== NULL
);
3680 PtrData
* ptr_data
= src_is_for_mic
?
3681 m_vars_extra
[i
].src_data
:
3682 m_vars_extra
[i
].dst_data
;
3683 if (m_vars
[i
].flags
.alloc_disp
) {
3684 m_in
.send_data(&ptr_data
->alloc_disp
,
3685 sizeof(ptr_data
->alloc_disp
));
3688 // send sink address to the target
3689 if (m_vars
[i
].flags
.sink_addr
) {
3690 m_in
.send_data(&ptr_data
->mic_addr
,
3691 sizeof(ptr_data
->mic_addr
));
3694 switch (m_vars
[i
].type
.dst
) {
3695 case c_data_ptr_array
:
3700 if (m_vars
[i
].direction
.in
&&
3701 !m_vars
[i
].flags
.is_static_dstn
) {
3703 char *ptr
= offload_get_src_base(m_vars
[i
].ptr
,
3704 m_vars
[i
].type
.src
);
3705 if (m_vars
[i
].type
.dst
== c_cean_var
) {
3706 // offset and length are derived from the array
3708 int64_t size
= m_vars
[i
].size
;
3709 int64_t disp
= m_vars
[i
].disp
;
3710 m_in
.send_data(reinterpret_cast<char*>(&size
),
3712 m_in
.send_data(reinterpret_cast<char*>(&disp
),
3716 m_in
.send_data(ptr
+ m_vars_extra
[i
].cpu_disp
,
3722 if (m_vars
[i
].direction
.bits
||
3723 m_vars
[i
].alloc_if
||
3724 m_vars
[i
].free_if
) {
3725 // send dope vector excluding base
3726 char *ptr
= static_cast<char*>(m_vars
[i
].ptr
);
3727 m_in
.send_data(ptr
+ sizeof(uint64_t),
3728 m_vars
[i
].size
- sizeof(uint64_t));
3733 // send to target addresses of obsolete
3734 // stacks to be released
3735 if (m_vars
[i
].flags
.is_stack_buf
&&
3736 !m_vars
[i
].direction
.bits
&&
3737 m_vars
[i
].alloc_if
&&
3738 m_vars
[i
].size
!= 0) {
3739 for (PtrDataList::iterator it
=
3740 m_destroy_stack
.begin();
3741 it
!= m_destroy_stack
.end(); it
++) {
3742 PtrData
* ptr_data
= *it
;
3743 m_in
.send_data(&(ptr_data
->mic_addr
),
3744 sizeof(ptr_data
->mic_addr
));
3749 if (m_vars
[i
].direction
.in
) {
3750 m_in
.send_func_ptr(*((const void**) m_vars
[i
].ptr
));
3760 m_status
->data_sent
+= m_in
.get_tfr_size();
3763 if (m_func_desc
->data_offset
== 0) {
3764 OffloadTimer
timer_unmap(get_timer_data(),
3765 c_offload_host_unmap_in_data_buffer
);
3766 COIRESULT res
= COI::BufferUnmap(map_inst
, 0, 0, 0);
3767 if (res
!= COI_SUCCESS
) {
3768 if (m_status
!= 0) {
3769 m_status
->result
= translate_coi_error(res
);
3772 report_coi_error(c_buf_unmap
, res
);
3777 OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in
.get_tfr_size());
3778 OFFLOAD_DEBUG_TRACE_1(1,
3779 GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data
,
3780 "Total copyin data sent to target: [%lld] bytes\n",
3781 m_in
.get_tfr_size());
// OffloadDescriptor::compute -- launches the target run-function when
// m_need_runfunction is set:
//   * builds the "misc" input area from m_func_desc (extended by the in-data
//     length when the copyin data lives inside the function descriptor) and
//     the "ret" output area from the out-data region,
//   * gathers input dependencies, falling back to the stream's dependencies
//     when there are none,
//   * calls m_device.compute() on m_stream, translating/reporting any COI
//     failure,
//   * optionally registers the OpenMP async callback for the completion
//     event, and records that event as the single input dependency for the
//     transfers that follow.
// NOTE(review): braces, several declarations (`ret`, `ret_len`, `event`,
// `res`) and the compute() argument lines (orig. 3819-3825) are missing from
// this extraction; fragments are preserved verbatim.
3786 bool OffloadDescriptor::compute(void *info
)
3788 OffloadTimer
timer(get_timer_data(), c_offload_host_start_compute
);
3790 if (m_need_runfunction
) {
3791 OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
3792 c_offload_compute
, "Compute task on MIC\n");
// Input area passed to the pipeline: the function descriptor itself.
3794 void* misc
= m_func_desc
;
3795 int misc_len
= m_func_desc_size
;
// When copyin data was placed inside the descriptor (data_offset != 0),
// it is shipped together with the misc area.
3799 if (m_func_desc
->data_offset
!= 0) {
3800 misc_len
+= m_in_datalen
;
// Output area: the out-data region inside the function descriptor.
3802 if (m_out_datalen
> 0) {
3803 ret
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
3804 ret_len
= m_out_datalen
;
// Start with the accumulated input dependencies from the data transfers.
3811 uint32_t in_deps_amount
= m_in_deps_total
;
3812 COIEVENT
*in_deps
= m_in_deps_total
> 0 ? m_in_deps
: 0;
// No transfer dependencies but a user stream: depend on the stream's
// pending work instead.
3814 if (0 == m_in_deps_total
&& m_stream
!= no_stream
) {
3815 get_stream_in_dependencies(in_deps_amount
, in_deps
);
3818 res
= m_device
.compute(m_stream
,
3826 if (res
!= COI_SUCCESS
) {
3827 if (m_status
!= 0) {
3828 m_status
->result
= translate_coi_error(res
);
3831 report_coi_error(c_pipeline_run_func
, res
);
// OpenMP async offload: fire the user's callback when the run-function
// event completes.
3834 if (m_omp_async_last_event_type
== c_last_runfunc
) {
3835 register_omp_event_call_back(&event
, info
);
// The run-function completion event becomes the sole input dependency
// for the subsequent (receive) phase.
3838 m_in_deps_total
= 1;
3839 m_in_deps
[0] = event
;
3845 // receive pointer data if source or destination or both of them are
3846 // noncontiguous. There is guarantee that length of destination enough for
3847 // transferred data.
3848 bool OffloadDescriptor::receive_noncontiguous_pointer_data(
3852 uint64_t &received_data
,
3853 uint32_t in_deps_amount
,
3857 int64_t offset_src
, offset_dst
;
3858 int64_t length_src
, length_dst
;
3859 int64_t length_src_cur
, length_dst_cur
;
3860 int64_t receive_size
;
3862 bool dst_is_empty
= true;
3863 bool src_is_empty
= true;
3865 char *base
= offload_get_src_base(
3867 static_cast<char*>(m_vars
[i
].into
) :
3868 static_cast<char*>(m_vars
[i
].ptr
),
3869 m_vars
[i
].type
.dst
);
3872 // Set length_src and length_dst
3873 length_src
= (m_vars_extra
[i
].read_rng_src
) ?
3874 m_vars_extra
[i
].read_rng_src
->range_size
: m_vars
[i
].size
;
3875 length_dst
= !m_vars
[i
].into
? length_src
:
3876 (m_vars_extra
[i
].read_rng_dst
) ?
3877 m_vars_extra
[i
].read_rng_dst
->range_size
: m_vars
[i
].size
;
3878 receive_size
= (length_src
< length_dst
) ? length_src
: length_dst
;
3880 // If BufferReadMultiD is defined we can set values of required arguments
3881 // and transfer noncontiguous data via call to the COI routine.
3882 if (__offload_use_coi_noncontiguous_transfer
&& COI::BufferReadMultiD
) {
3883 struct Arr_Desc
* arr_desc_dst
;
3884 struct Arr_Desc
* arr_desc_src
;
3885 int64_t size_src
, size_dst
;
3887 offset_src
= (m_vars_extra
[i
].read_rng_src
)?
3888 m_vars_extra
[i
].read_rng_src
->init_offset
: m_vars
[i
].disp
;
3889 size_src
= m_vars_extra
[i
].read_rng_src
?
3890 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
3893 offset_dst
= (m_vars_extra
[i
].read_rng_dst
)?
3894 m_vars_extra
[i
].read_rng_dst
->init_offset
: m_vars_extra
[i
].cpu_disp
;
3895 size_dst
= m_vars_extra
[i
].read_rng_dst
?
3896 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) : m_vars
[i
].size
;
3898 int64_t el_size
= (!m_vars
[i
].into
||
3899 (m_vars_extra
[i
].read_rng_src
&&
3900 m_vars_extra
[i
].read_rng_dst
)) ?
3902 m_vars_extra
[i
].read_rng_src
?
3903 m_vars_extra
[i
].read_rng_src
->arr_desc
->dim
[
3904 m_vars_extra
[i
].read_rng_src
->arr_desc
->rank
- 1].size
:
3905 m_vars_extra
[i
].read_rng_dst
->arr_desc
->dim
[
3906 m_vars_extra
[i
].read_rng_dst
->arr_desc
->rank
- 1].size
;
3907 arr_desc_src
= (m_vars_extra
[i
].read_rng_src
) ?
3908 m_vars_extra
[i
].read_rng_src
->arr_desc
:
3909 make_arr_desc(NULL
, // don't required for source
3910 offset_src
/el_size
, size_src
/el_size
,
3912 arr_desc_dst
= !m_vars
[i
].into
? arr_desc_src
:
3913 (m_vars_extra
[i
].read_rng_dst
) ?
3914 m_vars_extra
[i
].read_rng_dst
->arr_desc
:
3916 offset_dst
/el_size
, size_src
/el_size
, el_size
);
3918 arr_desc_dst
->base
= reinterpret_cast<int64_t>(base
);
3920 res
= COI::BufferReadMultiD(
3921 m_vars_extra
[i
].src_data
->mic_buf
, // SourceBuffer
3922 m_vars
[i
].offset
+ m_vars
[i
].mic_offset
-
3923 m_vars_extra
[i
].src_data
->alloc_disp
, // Offset
3924 (void*)arr_desc_dst
, // descriptor of DestArray
3925 (void*)arr_desc_src
, // descriptor of SrcArray
3926 COI_COPY_UNSPECIFIED
, // Type
3927 in_deps_amount
, // Number of in Dependencies
3928 in_deps
, // array of in Dependencies
3929 event
); // out Dependency
3930 if (res
!= COI_SUCCESS
) {
3931 if (m_status
!= 0) {
3932 m_status
->result
= translate_coi_error(res
);
3935 report_coi_error(c_buf_copy
, res
);
3939 // if event is defined we must multiplate for all contiguous intervals
3940 // that will be Copied/Read.
3941 // Take in account that we already have 1 event.
3943 m_out_deps_allocated
+= (length_src
/ receive_size
) *
3944 ((m_vars_extra
[i
].read_rng_src
) ?
3945 m_vars_extra
[i
].read_rng_src
->range_max_number
: 1) ;
3947 (COIEVENT
*)realloc(m_out_deps
, sizeof(COIEVENT
) * m_out_deps_allocated
);
3951 // consequently get contiguous ranges,
3952 // define corresponded destination offset and receive data
3956 if (m_vars_extra
[i
].read_rng_src
) {
3957 if (!get_next_range(m_vars_extra
[i
].read_rng_src
,
3959 // source ranges are over - nothing to send
3963 else if (received_data
== 0) {
3964 offset_src
= m_vars
[i
].disp
;
3969 length_src_cur
= length_src
;
3972 // if source is contiguous or its contiguous range is greater
3973 // than destination one
3974 offset_src
+= receive_size
;
3976 length_src_cur
-= receive_size
;
3977 src_is_empty
= length_src_cur
== 0;
3979 // get destination offset
3981 if (m_vars
[i
].into
) {
3982 if (m_vars_extra
[i
].read_rng_dst
) {
3983 if (!get_next_range(m_vars_extra
[i
].read_rng_dst
,
3985 // destination ranges are over
3986 LIBOFFLOAD_ERROR(c_destination_is_over
);
3990 // destination is contiguous.
3992 offset_dst
= m_vars_extra
[i
].cpu_disp
;
3994 length_dst_cur
= length_dst
;
3998 offset_dst
= offset_src
;
3999 length_dst_cur
= length_src
;
4003 // if destination is contiguous or its contiguous range is greater
4005 offset_dst
+= receive_size
;
4007 length_dst_cur
-= receive_size
;
4008 dst_is_empty
= length_dst_cur
== 0;
4010 event
= &m_out_deps
[m_out_deps_total
++];
4013 res
= COI::BufferCopy(
4015 m_vars_extra
[i
].src_data
->mic_buf
,
4016 m_vars_extra
[i
].cpu_offset
+ offset_dst
,
4017 m_vars
[i
].offset
+ offset_src
+
4018 m_vars
[i
].mic_offset
,
4020 COI_COPY_UNSPECIFIED
,
4024 if (res
!= COI_SUCCESS
) {
4025 if (m_status
!= 0) {
4026 m_status
->result
= translate_coi_error(res
);
4029 report_coi_error(c_buf_copy
, res
);
4033 res
= COI::BufferRead(
4034 m_vars_extra
[i
].src_data
->mic_buf
,
4035 m_vars
[i
].offset
+ offset_src
+
4036 m_vars
[i
].mic_offset
,
4039 COI_COPY_UNSPECIFIED
,
4043 if (res
!= COI_SUCCESS
) {
4044 if (m_status
!= 0) {
4045 m_status
->result
= translate_coi_error(res
);
4048 report_coi_error(c_buf_read
, res
);
4051 received_data
+= receive_size
;
4057 bool OffloadDescriptor::receive_pointer_data(bool is_async
,
4058 bool first_run
, void *info
)
4060 OffloadTimer
timer(get_timer_data(), c_offload_host_start_buffers_reads
);
4062 bool should_use_async_buffer_read
= m_initial_need_runfunction
;
4063 uint64_t ptr_received
= 0;
4066 // For offload_transfer and offload with empty body without signal:
4067 // - if there is only one buffer copy - get data synchronously
4068 // - if there are multiple buffer copy and
4069 // __offload_parallel_copy is false - get data synchronously
4070 // - if there are multiple buffer copy
4071 // and __offload_parallel_copy is true - get data asynchronously
4072 // It concerns only data with size greater than __offload_use_async_buffer_read.
4073 // Data of size less than __offload_use_async_buffer_read are received synchronously.
4074 // Synchronous transfer results in better performance in COI.
4075 // __offload_parallel_copy is false by default but can be changed
4076 // via environment variable OFFLOAD_PARALLEL_COPY
4077 if (!m_initial_need_runfunction
&& __offload_parallel_copy
) {
4078 int big_size_count
= 0;
4080 for (int i
= 0; i
< m_vars_total
; i
++) {
4081 if (m_vars
[i
].direction
.out
&&
4082 m_vars
[i
].size
>= __offload_use_async_buffer_read
) {
4083 // preallocated OUT only at second run
4084 if (first_run
== m_vars
[i
].flags
.preallocated
) {
4087 switch (m_vars
[i
].type
.src
) {
4091 if (m_vars
[i
].flags
.is_static
) {
4097 case c_cean_var_ptr
:
4100 case c_dv_data_slice
:
4101 case c_dv_ptr_data_slice
:
4110 if (big_size_count
> 1) {
4111 should_use_async_buffer_read
= true;
4114 uint32_t in_deps_amount
= m_in_deps_total
;
4115 COIEVENT
*in_deps
= m_in_deps_total
> 0 ? m_in_deps
: 0;
4117 if (0 == m_in_deps_total
&&
4118 m_stream
!= no_stream
&&
4119 m_vars_total
!= 0) {
4120 get_stream_in_dependencies(in_deps_amount
, in_deps
);
4123 for (int i
= 0; i
< m_vars_total
; i
++) {
4124 uint64_t received_data
= m_vars
[i
].size
;
4125 uint32_t in_deps_amount_save
;
4126 COIEVENT
*in_deps_save
;
4128 if (m_vars_extra
[i
].omp_last_event_type
== c_last_read
) {
4129 in_deps_amount_save
= in_deps_amount
;
4130 in_deps_save
= in_deps
;
4132 in_deps_amount
+= m_out_deps_total
;
4133 if (in_deps_amount
> 0) {
4134 in_deps
= (COIEVENT
*) malloc(sizeof(COIEVENT
) * in_deps_amount
);
4135 if (in_deps
== NULL
)
4136 LIBOFFLOAD_ERROR(c_malloc
);
4137 memcpy(in_deps
, in_deps_save
,
4138 in_deps_amount_save
* sizeof(COIEVENT
));
4139 memcpy(in_deps
+ in_deps_amount_save
* sizeof(COIEVENT
),
4141 m_out_deps_total
* sizeof(COIEVENT
));
4144 // At first run don't receive by preallocated target pointer as the
4145 //pointer value will be ready later after call to scatter_copyout_data
4146 if (first_run
&& m_vars
[i
].alloc_if
&& m_vars
[i
].flags
.preallocated
) {
4147 m_preallocated_alloc
= true;
4148 // need one more call to OffloadDescriptor::receive_pointer_data
4149 if (m_vars
[i
].direction
.out
) {
4150 m_out_with_preallocated
= true;
4154 switch (m_vars
[i
].type
.src
) {
4155 case c_data_ptr_array
:
4160 if (m_vars
[i
].direction
.out
&&
4161 m_vars
[i
].flags
.is_static
) {
4164 m_in_deps_total
> 0 ||
4165 (should_use_async_buffer_read
&&
4166 m_vars
[i
].size
>= __offload_use_async_buffer_read
)) ?
4167 &m_out_deps
[m_out_deps_total
++] : 0;
4168 PtrData
*ptr_data
= NULL
;
4169 COIBUFFER dst_buf
= NULL
; // buffer at host
4172 if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
)) {
4173 ptr_data
= m_vars
[i
].into
?
4174 m_vars_extra
[i
].dst_data
:
4175 m_vars_extra
[i
].src_data
;
4177 else if (VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.dst
)) {
4178 if (m_vars
[i
].flags
.is_static_dstn
) {
4179 ptr_data
= m_vars
[i
].into
?
4180 m_vars_extra
[i
].dst_data
:
4181 m_vars_extra
[i
].src_data
;
4184 dst_buf
= ptr_data
? ptr_data
->cpu_buf
: NULL
;
4185 if (dst_buf
== NULL
) {
4186 base
= offload_get_src_base(
4188 static_cast<char*>(m_vars
[i
].into
) :
4189 static_cast<char*>(m_vars
[i
].ptr
),
4190 m_vars
[i
].type
.dst
);
4193 if (m_vars
[i
].flags
.is_noncont_src
||
4194 m_vars
[i
].flags
.is_noncont_dst
) {
4195 receive_noncontiguous_pointer_data(
4196 i
, dst_buf
, event
, received_data
,
4197 in_deps_amount
, in_deps
);
4199 else if (dst_buf
!= 0) {
4200 res
= COI::BufferCopy(
4202 m_vars_extra
[i
].src_data
->mic_buf
,
4203 m_vars_extra
[i
].cpu_offset
+
4204 m_vars_extra
[i
].cpu_disp
,
4205 m_vars
[i
].offset
+ m_vars
[i
].disp
,
4207 COI_COPY_UNSPECIFIED
,
4211 if (res
!= COI_SUCCESS
) {
4212 if (m_status
!= 0) {
4213 m_status
->result
= translate_coi_error(res
);
4216 report_coi_error(c_buf_copy
, res
);
4220 res
= COI::BufferRead(
4221 m_vars_extra
[i
].src_data
->mic_buf
,
4222 m_vars
[i
].offset
+ m_vars
[i
].disp
,
4223 base
+ m_vars_extra
[i
].cpu_offset
+
4224 m_vars_extra
[i
].cpu_disp
,
4226 COI_COPY_UNSPECIFIED
,
4230 if (res
!= COI_SUCCESS
) {
4231 if (m_status
!= 0) {
4232 m_status
->result
= translate_coi_error(res
);
4235 report_coi_error(c_buf_read
, res
);
4238 ptr_received
+= received_data
;
4244 case c_cean_var_ptr
:
4247 case c_dv_data_slice
:
4248 case c_dv_ptr_data_slice
:
4250 COIBUFFER dst_buf
= NULL
; // buffer on host
4251 if (m_vars
[i
].direction
.out
&& m_vars
[i
].size
> 0) {
4254 m_in_deps_total
> 0 ||
4255 (should_use_async_buffer_read
&&
4256 m_vars
[i
].size
>= __offload_use_async_buffer_read
)) ?
4257 &m_out_deps
[m_out_deps_total
++] : 0;
4259 uint64_t dst_offset
= 0;
4260 char *base
= static_cast<char*>(m_vars
[i
].ptr
);
4262 if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
)) {
4263 PtrData
*ptr_data
= m_vars
[i
].into
?
4264 m_vars_extra
[i
].dst_data
:
4265 m_vars_extra
[i
].src_data
;
4266 dst_buf
= ptr_data
? ptr_data
->cpu_buf
: NULL
;
4267 if (dst_buf
== NULL
) {
4268 base
= m_vars
[i
].into
?
4269 *static_cast<char**>(m_vars
[i
].into
) :
4270 *static_cast<char**>(m_vars
[i
].ptr
);
4272 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4273 m_vars_extra
[i
].cpu_disp
;
4275 else if (VAR_TYPE_IS_SCALAR(m_vars
[i
].type
.dst
)) {
4276 if (m_vars
[i
].flags
.is_static_dstn
) {
4277 dst_buf
= m_vars
[i
].into
?
4278 m_vars_extra
[i
].dst_data
->cpu_buf
:
4279 m_vars_extra
[i
].src_data
->cpu_buf
;
4281 if (dst_buf
== NULL
) {
4282 base
= offload_get_src_base(
4284 static_cast<char*>(m_vars
[i
].into
) :
4285 static_cast<char*>(m_vars
[i
].ptr
),
4286 m_vars
[i
].type
.dst
);
4288 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4289 m_vars_extra
[i
].cpu_disp
;
4291 else if (VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.dst
) ||
4292 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
)) {
4293 PtrData
*ptr_data
= m_vars
[i
].into
!= 0 ?
4294 m_vars_extra
[i
].dst_data
:
4295 m_vars_extra
[i
].src_data
;
4296 dst_buf
= ptr_data
!= 0 ? ptr_data
->cpu_buf
: 0;
4297 if (dst_buf
== NULL
) {
4298 base
= offload_get_src_base(
4300 static_cast<char*>(m_vars
[i
].into
) :
4301 static_cast<char*>(m_vars
[i
].ptr
),
4302 m_vars
[i
].type
.dst
);
4305 dst_offset
= m_vars_extra
[i
].cpu_offset
+
4306 m_vars_extra
[i
].cpu_disp
;
4309 if (m_vars
[i
].flags
.is_noncont_src
||
4310 m_vars
[i
].flags
.is_noncont_dst
) {
4311 receive_noncontiguous_pointer_data(
4312 i
, dst_buf
, event
, received_data
,
4316 else if (dst_buf
!= 0) {
4317 res
= COI::BufferCopy(
4319 m_vars_extra
[i
].src_data
->mic_buf
,
4321 m_vars
[i
].offset
+ m_vars
[i
].disp
+
4322 m_vars
[i
].mic_offset
,
4324 COI_COPY_UNSPECIFIED
,
4328 if (res
!= COI_SUCCESS
) {
4329 if (m_status
!= 0) {
4330 m_status
->result
= translate_coi_error(res
);
4333 report_coi_error(c_buf_copy
, res
);
4337 res
= COI::BufferRead(
4338 m_vars_extra
[i
].src_data
->mic_buf
,
4339 m_vars
[i
].offset
+ m_vars
[i
].disp
+
4340 m_vars
[i
].mic_offset
,
4343 COI_COPY_UNSPECIFIED
,
4347 if (res
!= COI_SUCCESS
) {
4348 if (m_status
!= 0) {
4349 m_status
->result
= translate_coi_error(res
);
4352 report_coi_error(c_buf_read
, res
);
4355 ptr_received
+= received_data
;
4364 if (m_vars_extra
[i
].omp_last_event_type
== c_last_read
) {
4365 in_deps_amount
= in_deps_amount_save
;
4366 in_deps
= in_deps_save
;
4367 register_omp_event_call_back(&m_out_deps
[m_out_deps_total
- 1], info
);
4369 // destroy buffers for obsolete stacks
4370 if (m_destroy_stack
.size() != 0) {
4371 for (PtrDataList::iterator it
= m_destroy_stack
.begin();
4372 it
!= m_destroy_stack
.end(); it
++) {
4373 PtrData
*ptr_data
= *it
;
4374 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4375 OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
4376 ptr_data
->mic_addr
);
4378 m_destroy_stack
.clear();
4380 if (m_vars
[i
].free_if
) {
4381 // remove association for automatic variables
4382 if (m_is_openmp
&& !m_vars
[i
].flags
.is_static
&&
4383 (m_vars
[i
].type
.src
== c_data
||
4384 m_vars
[i
].type
.src
== c_void_ptr
||
4385 m_vars
[i
].type
.src
== c_cean_var
)) {
4386 AutoData
*auto_data
= m_vars_extra
[i
].auto_data
;
4387 if (auto_data
!= 0) {
4388 if (m_vars
[i
].flags
.always_delete
) {
4389 auto_data
->nullify_reference();
4391 else if(auto_data
->remove_reference() == 0) {
4392 m_device
.remove_auto_data(auto_data
->cpu_addr
.start());
4398 if (m_vars
[i
].direction
.out
|| m_vars
[i
].into
== NULL
) {
4399 if (!VAR_TYPE_IS_PTR(m_vars
[i
].type
.src
) &&
4400 !VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.src
) &&
4401 !VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.src
)) {
4405 PtrData
*ptr_data
= m_vars_extra
[i
].src_data
;
4406 if (ptr_data
->remove_reference() == 0) {
4408 if (ptr_data
->cpu_buf
!= 0) {
4409 m_destroy_buffers
.push_back(ptr_data
->cpu_buf
);
4411 if (ptr_data
->mic_buf
!= 0) {
4412 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4414 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4415 ptr_data
->cpu_addr
.start());
4417 // remove association from map
4418 if (m_vars
[i
].flags
.targetptr
) {
4419 m_device
.remove_targetptr_data(ptr_data
->cpu_addr
.start());
4422 m_device
.remove_ptr_data(ptr_data
->cpu_addr
.start());
4426 else if (VAR_TYPE_IS_PTR(m_vars
[i
].type
.dst
) ||
4427 VAR_TYPE_IS_DV_DATA_SLICE(m_vars
[i
].type
.dst
) ||
4428 VAR_TYPE_IS_DV_DATA(m_vars
[i
].type
.dst
)) {
4429 PtrData
*ptr_data
= m_vars_extra
[i
].dst_data
;
4430 if (ptr_data
->remove_reference() == 0) {
4432 if (ptr_data
->cpu_buf
!= 0) {
4433 m_destroy_buffers
.push_back(ptr_data
->cpu_buf
);
4435 if (ptr_data
->mic_buf
!= 0) {
4436 m_destroy_buffers
.push_back(ptr_data
->mic_buf
);
4438 OFFLOAD_TRACE(3, "Removing association for addr %p\n",
4439 ptr_data
->cpu_addr
.start());
4441 // remove association from map
4442 if (m_vars
[i
].flags
.targetptr
) {
4443 m_device
.remove_targetptr_data(ptr_data
->cpu_addr
.start());
4446 m_device
.remove_ptr_data(ptr_data
->cpu_addr
.start());
4454 m_status
->data_received
+= ptr_received
;
4457 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received
);
4458 OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
4459 c_offload_received_pointer_data
,
4460 "Total pointer data received from target: [%lld] bytes\n",
4466 bool OffloadDescriptor::scatter_copyout_data()
4468 OffloadTimer
timer(get_timer_data(), c_offload_host_scatter_outputs
);
4470 if (m_need_runfunction
&& m_out_datalen
> 0) {
4472 // total size that need to be transferred from target to host
4473 COIMAPINSTANCE map_inst
;
4477 // output data buffer
4478 if (m_func_desc
->data_offset
== 0) {
4479 OffloadTimer
timer_map(get_timer_data(),
4480 c_offload_host_map_out_data_buffer
);
4482 COIRESULT res
= COI::BufferMap(m_inout_buf
, 0, m_out_datalen
,
4483 COI_MAP_READ_ONLY
, 0, 0, 0,
4485 reinterpret_cast<void**>(&data
));
4486 if (res
!= COI_SUCCESS
) {
4487 if (m_status
!= 0) {
4488 m_status
->result
= translate_coi_error(res
);
4491 report_coi_error(c_buf_map
, res
);
4495 data
= (char*) m_func_desc
+ m_func_desc
->data_offset
;
4499 OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data
);
4500 data
+= OFFLOAD_TIMER_DATALEN();
4502 // initialize output marshaller
4503 m_out
.init_buffer(data
, m_out_datalen
);
4505 for (int i
= 0; i
< m_vars_total
; i
++) {
4506 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
4507 m_vars
[i
].into
== NULL
);
4509 if (m_vars
[i
].type
.src
!= c_data_ptr_array
&&
4510 m_vars
[i
].flags
.preallocated
&& m_vars
[i
].alloc_if
) {
4513 void ** cpu_ptr
= src_is_for_mic
?
4514 reinterpret_cast<void**>(m_vars
[i
].ptr
) :
4515 reinterpret_cast<void**>(m_vars
[i
].into
);
4516 void* alloc_base
= NULL
;
4517 int64_t alloc_disp
= 0;
4519 if (m_vars_extra
[i
].alloc
!= NULL
) {
4521 const Arr_Desc
*ap
=
4522 static_cast<const Arr_Desc
*>(m_vars_extra
[i
].alloc
);
4524 __arr_data_offset_and_length(ap
, alloc_disp
, alloc_size
);
4526 alloc_base
= reinterpret_cast<void*>(ap
->base
);
4529 // get pointer to target memory
4530 m_out
.receive_data(&ptr_value
, sizeof(void*));
4533 if (!alloc_ptr_data(
4536 (alloc_base
!= NULL
) ?
4537 alloc_disp
: m_vars
[i
].disp
,
4538 (alloc_base
!= NULL
) ?
4539 alloc_size
: m_vars
[i
].size
,
4542 m_vars
[i
].flags
.targetptr
,
4543 m_vars
[i
].flags
.preallocated
,
4544 m_vars
[i
].flags
.pin
)) {
4548 ptr_data
->add_reference();
4549 *cpu_ptr
= ptr_value
;
4550 if (src_is_for_mic
) {
4551 m_vars_extra
[i
].src_data
= ptr_data
;
4554 m_vars_extra
[i
].dst_data
= ptr_data
;
4556 m_vars
[i
].offset
= (char*) ptr_value
-
4557 (char*) ptr_data
->cpu_addr
.start();
4560 switch (m_vars
[i
].type
.src
) {
4561 case c_data_ptr_array
:
4566 if (m_vars
[i
].direction
.out
&&
4567 !m_vars
[i
].flags
.is_static
) {
4569 if (m_vars
[i
].into
) {
4570 char *ptr
= offload_get_src_base(
4571 static_cast<char*>(m_vars
[i
].into
),
4572 m_vars
[i
].type
.dst
);
4573 m_out
.receive_data(ptr
+ m_vars_extra
[i
].cpu_disp
,
4578 static_cast<char*>(m_vars
[i
].ptr
) +
4579 m_vars_extra
[i
].cpu_disp
,
4586 if (m_vars
[i
].direction
.out
) {
4587 m_out
.receive_func_ptr((const void**) m_vars
[i
].ptr
);
4597 m_status
->data_received
+= m_out
.get_tfr_size();
4600 if (m_func_desc
->data_offset
== 0) {
4601 OffloadTimer
timer_unmap(get_timer_data(),
4602 c_offload_host_unmap_out_data_buffer
);
4604 COIRESULT res
= COI::BufferUnmap(map_inst
, 0, 0, 0);
4605 if (res
!= COI_SUCCESS
) {
4606 if (m_status
!= 0) {
4607 m_status
->result
= translate_coi_error(res
);
4610 report_coi_error(c_buf_unmap
, res
);
4615 OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out
.get_tfr_size());
4616 OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
4617 m_out
.get_tfr_size());
4622 static void get_arr_desc_numbers(
4628 CeanReadRanges
* &ptr_ranges
4631 if (is_arr_desc_contiguous(ap
)) {
4633 __arr_data_offset_and_length(ap
, offset
, size
);
4634 el_number
= size
/ el_size
;
4637 ptr_ranges
= init_read_ranges_arr_desc(ap
);
4638 el_number
= (ptr_ranges
->range_size
/ el_size
) *
4639 ptr_ranges
->range_max_number
;
4640 size
= ptr_ranges
->range_size
;
4644 bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i
)
4646 int pointers_number
;
4648 int new_index
= m_vars_total
;
4650 const VarDesc3
*vd3
= static_cast<const VarDesc3
*>(m_vars
[i
].ptr
);
4651 int flags
= vd3
->array_fields
;
4652 bool src_is_for_mic
= (m_vars
[i
].direction
.out
||
4653 m_vars
[i
].into
== NULL
);
4655 ReadArrElements
<void *> ptr
;
4656 ReadArrElements
<void *> into
;
4657 ReadArrElements
<int64_t> ext_start
;
4658 ReadArrElements
<int64_t> ext_elements
;
4659 ReadArrElements
<int64_t> align
;
4660 ReadArrElements
<int64_t> alloc_if
;
4661 ReadArrElements
<int64_t> free_if
;
4662 ReadArrElements
<int64_t> into_start
;
4663 ReadArrElements
<int64_t> into_elem
;
4664 ReadArrElements
<int64_t> alloc_start
;
4665 ReadArrElements
<int64_t> alloc_elem
;
4668 ap
= static_cast<const Arr_Desc
*>(vd3
->ptr_array
);
4670 // "pointers_number" for total number of transferred pointers.
4671 // For each of them we create new var_desc and put it at the bottom
4672 // of the var_desc's array
4673 get_arr_desc_numbers(ap
, sizeof(void *), ptr
.offset
, ptr
.size
,
4674 pointers_number
, ptr
.ranges
);
4675 ptr
.base
= (m_vars
[i
].flags
.is_pointer
) ?
4676 *(reinterpret_cast<char**>(ap
->base
)) :
4677 reinterpret_cast<char*>(ap
->base
);
4679 // 2. prepare memory for new var_descs
4680 m_vars_total
+= pointers_number
;
4681 m_vars
= (VarDesc
*)realloc(m_vars
, m_vars_total
* sizeof(VarDesc
));
4683 LIBOFFLOAD_ERROR(c_malloc
);
4685 (VarExtra
*)realloc(m_vars_extra
, m_vars_total
* sizeof(VarExtra
));
4686 if (m_vars_extra
== NULL
)
4687 LIBOFFLOAD_ERROR(c_malloc
);
4689 (COIEVENT
*)realloc(m_in_deps
, sizeof(COIEVENT
) * (m_vars_total
+ 1));
4690 if (m_in_deps
== NULL
)
4691 LIBOFFLOAD_ERROR(c_malloc
);
4693 (COIEVENT
*)realloc(m_out_deps
, sizeof(COIEVENT
) * m_vars_total
);
4694 if (m_out_deps
== NULL
)
4695 LIBOFFLOAD_ERROR(c_malloc
);
4697 // 3. Prepare for reading new var_desc's fields
4699 if ((flags
& (1<<flag_extent_start_is_array
)) != 0) {
4700 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_start
);
4701 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, ext_start
.offset
,
4702 ext_start
.size
, tmp_val
, ext_start
.ranges
);
4703 ext_start
.base
= reinterpret_cast<char*>(ap
->base
);
4704 ext_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4706 if (tmp_val
< pointers_number
) {
4707 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent start");
4711 else if ((flags
& (1<<flag_extent_start_is_scalar
)) != 0) {
4712 ext_start
.val
= (int64_t)vd3
->extent_start
;
4718 // EXTENT ELEMENTS NUMBER
4719 if ((flags
& (1<<flag_extent_elements_is_array
)) != 0) {
4720 ap
= static_cast<const Arr_Desc
*>(vd3
->extent_elements
);
4721 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
,
4722 ext_elements
.offset
, ext_elements
.size
,
4723 tmp_val
, ext_elements
.ranges
);
4724 ext_elements
.base
= reinterpret_cast<char*>(ap
->base
);
4725 ext_elements
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4727 if (tmp_val
< pointers_number
) {
4728 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent elements");
4732 else if ((flags
& (1<<flag_extent_elements_is_scalar
)) != 0) {
4733 ext_elements
.val
= (int64_t)vd3
->extent_elements
;
4736 ext_elements
.val
= m_vars
[i
].count
;
4740 if ((flags
& (1<<flag_alloc_if_is_array
)) != 0) {
4741 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_if_array
);
4742 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, alloc_if
.offset
,
4743 alloc_if
.size
, tmp_val
, alloc_if
.ranges
);
4744 alloc_if
.base
= reinterpret_cast<char*>(ap
->base
);
4745 alloc_if
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4747 if (tmp_val
< pointers_number
) {
4748 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_if");
4753 alloc_if
.val
= m_vars
[i
].alloc_if
;
4757 if ((flags
& (1<<flag_free_if_is_array
)) != 0) {
4758 ap
= static_cast<const Arr_Desc
*>(vd3
->free_if_array
);
4759 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, free_if
.offset
,
4760 free_if
.size
, tmp_val
, free_if
.ranges
);
4761 free_if
.base
= reinterpret_cast<char*>(ap
->base
);
4762 free_if
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4764 if (tmp_val
< pointers_number
) {
4765 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "free_if");
4770 free_if
.val
= m_vars
[i
].free_if
;
4775 if ((flags
& (1<<flag_align_is_array
)) != 0) {
4776 ap
= static_cast<const Arr_Desc
*>(vd3
->align_array
);
4777 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, align
.offset
,
4778 align
.size
, tmp_val
, align
.ranges
);
4779 align
.base
= reinterpret_cast<char*>(ap
->base
);
4780 align
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4782 if (tmp_val
< pointers_number
) {
4783 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "align");
4788 align
.val
= m_vars
[i
].align
;
4793 if (m_vars
[i
].into
) {
4794 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
4795 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into
.offset
,
4796 into
.size
, tmp_val
, into
.ranges
);
4797 into
.base
= reinterpret_cast<char*>(ap
->base
);
4799 if (tmp_val
< pointers_number
) {
4800 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into");
4807 if ((flags
& (1<<flag_into_start_is_array
)) != 0) {
4808 ap
= static_cast<const Arr_Desc
*>(vd3
->into_start
);
4809 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into_start
.offset
,
4810 into_start
.size
, tmp_val
, into_start
.ranges
);
4811 into_start
.base
= reinterpret_cast<char*>(ap
->base
);
4812 into_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4814 if (tmp_val
< pointers_number
) {
4815 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent start");
4819 else if ((flags
& (1<<flag_into_start_is_scalar
)) != 0) {
4820 into_start
.val
= (int64_t)vd3
->into_start
;
4826 // 3.3 INTO_ELEMENTS
4828 if ((flags
& (1<<flag_into_elements_is_array
)) != 0) {
4829 ap
= static_cast<const Arr_Desc
*>(vd3
->into_elements
);
4830 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, into_elem
.offset
,
4831 into_elem
.size
, tmp_val
, into_elem
.ranges
);
4832 into_elem
.base
= reinterpret_cast<char*>(ap
->base
);
4833 into_elem
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4835 if (tmp_val
< pointers_number
) {
4836 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent elements");
4840 else if ((flags
& (1<<flag_into_elements_is_scalar
)) != 0) {
4841 into_elem
.val
= (int64_t)vd3
->into_elements
;
4844 into_elem
.val
= m_vars
[i
].count
;
4849 if ((flags
& (1<<flag_alloc_start_is_array
)) != 0) {
4850 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_start
);
4851 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
,
4852 alloc_start
.offset
, alloc_start
.size
, tmp_val
,
4853 alloc_start
.ranges
);
4854 alloc_start
.base
= reinterpret_cast<char*>(ap
->base
);
4855 alloc_start
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4857 if (tmp_val
< pointers_number
) {
4858 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent start");
4862 else if ((flags
& (1<<flag_alloc_start_is_scalar
)) != 0) {
4863 alloc_start
.val
= (int64_t)vd3
->alloc_start
;
4866 alloc_start
.val
= 0;
4871 if ((flags
& (1<<flag_alloc_elements_is_array
)) != 0) {
4872 ap
= static_cast<const Arr_Desc
*>(vd3
->alloc_elements
);
4873 get_arr_desc_numbers(ap
, ap
->dim
[ap
->rank
- 1].size
, alloc_elem
.offset
,
4874 alloc_elem
.size
, tmp_val
, alloc_elem
.ranges
);
4875 alloc_elem
.base
= reinterpret_cast<char*>(ap
->base
);
4876 alloc_elem
.el_size
= ap
->dim
[ap
->rank
- 1].size
;
4877 if (tmp_val
< pointers_number
) {
4878 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
,
4879 "alloc_extent elements");
4883 else if ((flags
& (1<<flag_alloc_elements_is_scalar
)) != 0) {
4884 alloc_elem
.val
= (int64_t)vd3
->alloc_elements
;
4890 for (int k
= 0; k
< pointers_number
; k
++) {
4891 int type
= flags
& 0x3f;
4892 int type_src
, type_dst
;
4894 // type_src, type_dst
4895 type_src
= type_dst
= (type
== c_data_ptr_array
) ?
4896 c_data_ptr
: (type
== c_func_ptr_array
) ?
4897 c_func_ptr
: (type
== c_void_ptr_array
) ?
4898 c_void_ptr
: (type
== c_string_ptr_array
) ?
4902 if (!ptr
.read_next(true)) {
4906 ptr
.val
= (void*)(ptr
.base
+ ptr
.offset
);
4909 // !!! If we got error at phase of reading - it's an internal
4910 // !!! error, as we must detect mismatch before
4913 if (m_vars
[i
].into
) {
4914 if (!into
.read_next(true)) {
4915 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into");
4919 into
.val
= (void*)(into
.base
+ into
.offset
);
4923 // Get other components of the clause
4924 if (!ext_start
.read_next(flags
& (1<<flag_extent_start_is_array
))) {
4925 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent start");
4928 if (!ext_elements
.read_next(
4929 flags
& (1<<flag_extent_elements_is_array
))) {
4930 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "extent elements");
4933 if (!alloc_if
.read_next(flags
& (1<<flag_alloc_if_is_array
))) {
4934 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_if");
4937 if (!free_if
.read_next(flags
& (1<<flag_free_if_is_array
))) {
4938 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "free_if");
4941 if (!align
.read_next(flags
& (1<<flag_align_is_array
))) {
4942 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "align");
4945 if (!into_start
.read_next(flags
& (1<<flag_into_start_is_array
))) {
4946 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent start");
4949 if (!into_elem
.read_next(flags
& (1<<flag_into_elements_is_array
))) {
4950 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "into_extent elements");
4953 if (!alloc_start
.read_next(flags
& (1<<flag_alloc_start_is_array
))) {
4954 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent start");
4957 if (!alloc_elem
.read_next(
4958 flags
& (1<<flag_alloc_elements_is_array
))) {
4959 LIBOFFLOAD_ERROR(c_pointer_array_mismatch
, "alloc_extent elements");
4963 m_vars
[new_index
+ k
].direction
.bits
= m_vars
[i
].direction
.bits
;
4964 m_vars
[new_index
+ k
].alloc_if
= alloc_if
.val
;
4965 m_vars
[new_index
+ k
].free_if
= free_if
.val
;
4966 m_vars
[new_index
+ k
].align
= align
.val
;
4967 m_vars
[new_index
+ k
].mic_offset
= 0;
4968 m_vars
[new_index
+ k
].flags
.bits
= m_vars
[i
].flags
.bits
;
4969 m_vars
[new_index
+ k
].offset
= 0;
4970 m_vars
[new_index
+ k
].size
= m_vars
[i
].size
;
4971 m_vars
[new_index
+ k
].flags
.targetptr
= m_vars
[i
].flags
.targetptr
;
4972 m_vars
[new_index
+ k
].flags
.preallocated
=
4973 m_vars
[i
].flags
.preallocated
;
4975 if (ext_start
.val
== 0) {
4976 m_vars
[new_index
+ k
].count
= ext_elements
.val
;
4977 m_vars
[new_index
+ k
].ptr
= ptr
.val
;
4978 if (type_src
== c_string_ptr
) {
4979 m_vars
[new_index
+ k
].size
= 0;
4983 m_vars
[new_index
+ k
].count
= 0;
4984 m_vars
[new_index
+ k
].ptr
=
4985 static_cast<void*>(make_arr_desc(
4991 type_src
= type_src
== c_data_ptr
? c_cean_var_ptr
:
4992 c_string_ptr
? c_cean_var_ptr
:
4994 if (!m_vars
[i
].into
) {
4995 type_dst
= type_src
;
4999 if (m_vars
[i
].into
&& into_elem
.val
!= 0) {
5000 m_vars
[new_index
+ k
].into
=
5001 static_cast<void*>(make_arr_desc(
5006 type_dst
= (type
== c_data_ptr_array
) ? c_cean_var_ptr
:
5007 (type
== c_string_ptr_array
) ? c_cean_var_ptr
:
5011 m_vars
[new_index
+ k
].into
= NULL
;
5014 if (alloc_elem
.val
!= 0) {
5015 m_vars
[new_index
+ k
].alloc
=
5016 static_cast<void*>(make_arr_desc(
5023 m_vars
[new_index
+ k
].alloc
= NULL
;
5026 m_vars
[new_index
+ k
].type
.src
= type_src
;
5027 m_vars
[new_index
+ k
].type
.dst
= type_dst
;
5029 m_vars_extra
[new_index
+ k
].alloc
= m_vars
[new_index
+ k
].alloc
;
5030 m_vars_extra
[new_index
+ k
].is_arr_ptr_el
= 1;
5031 m_vars_extra
[new_index
+ k
].ptr_arr_offset
=
5032 src_is_for_mic
? ptr
.offset
: into
.offset
;
5034 // count and alloc fields are useless at target. They can be reused
5035 // for pointer arrays.
5036 m_vars
[i
].count
= pointers_number
;
5037 m_vars
[i
].ptr_arr_offset
= new_index
;
5041 // Gets in dependencies of the previous offload via the stream "m_stream".
5042 // Out argument in_deps_amount - address of amount of the dependencies
5043 // Out argument in_deps - array of dependencies.
5044 // Description of the dependencies scheme for streams :
5045 // ----------------------------------------------------
5046 // Every offload forms DAG consisted of 3 nodes:
5047 // for in-transfers, runfunction and out-transfers.
5048 // Every node has in-dependencies and out-dependencies
5049 // Out-dependencies of previous node forms in-dependencies of current node.
5050 // In-dependencies of 1-st node (of in-transfers) without streams is equal
5051 // to NULL. For streams in-dependencies of 1-st node is equal to list of out
5052 // dependencies of last node of previous offload via this stream.
5053 // So we can say that DAGs of 2 consequent offloads via the same stream are
5054 // connected by the way described above.
5055 void OffloadDescriptor::get_stream_in_dependencies(
5056 uint32_t &in_deps_amount
,
5060 if (m_stream
!= no_stream
&& m_stream
!= 0) {
5061 Stream
* stream
= Stream::find_stream(m_stream
, false);
5063 LIBOFFLOAD_ERROR(c_offload_no_stream
,
5064 m_device
.get_logical_index());
5067 OffloadDescriptor
* offload
= stream
->get_last_offload();
5069 // if it's the first offload in the stream
5073 // if last offload has out-tranfers
5074 if (offload
->m_out_deps_total
) {
5075 in_deps_amount
= offload
->m_out_deps_total
;
5076 in_deps
= offload
->m_out_deps
;
5078 // last offload only sends pointer data or run function or both of them
5079 // and has no out-transfers
5080 else if (offload
->m_in_deps_total
) {
5081 in_deps_amount
= offload
->m_in_deps_total
;
5082 in_deps
= offload
->m_in_deps
;
5087 static void __offload_fini_library(void)
5089 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
5090 if (mic_engines_total
> 0) {
5091 delete[] mic_engines
;
5092 mic_engines_total
= 0;
5094 if (mic_proxy_fs_root
!= 0) {
5095 free(mic_proxy_fs_root
);
5096 mic_proxy_fs_root
= 0;
5099 if (mic_library_path
!= 0) {
5100 free(mic_library_path
);
5101 mic_library_path
= 0;
5104 // destroy thread key
5105 thread_key_delete(mic_thread_key
);
5108 // unload COI library
5109 if (COI::is_available
) {
5113 OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
5116 static void __offload_init_library_once(void)
5119 uint32_t num_devices
;
5120 std::bitset
<MIC_ENGINES_MAX
> devices
;
5121 prefix
= report_get_message_str(c_report_host
);
5124 const char *env_var
= getenv(htrace_envname
);
5125 if (env_var
!= 0 && *env_var
!= '\0') {
5127 if (__offload_parse_int_string(env_var
, new_val
)) {
5128 console_enabled
= new_val
& 0x0f;
5132 env_var
= getenv(offload_report_envname
);
5133 if (env_var
!= 0 && *env_var
!= '\0') {
5135 if (__offload_parse_int_string(env_var
, env_val
)) {
5136 if (env_val
== OFFLOAD_REPORT_1
||
5137 env_val
== OFFLOAD_REPORT_2
||
5138 env_val
== OFFLOAD_REPORT_3
) {
5139 offload_report_level
= env_val
;
5142 LIBOFFLOAD_ERROR(c_invalid_env_report_value
,
5143 offload_report_envname
);
5147 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
5148 offload_report_envname
);
5151 else if (!offload_report_level
) {
5152 env_var
= getenv(timer_envname
);
5153 if (env_var
!= 0 && *env_var
!= '\0') {
5154 timer_enabled
= atoi(env_var
);
5163 // get number of devices installed in the system
5164 res
= COI::EngineGetCount(COI_ISA_MIC
, &num_devices
);
5165 if (res
!= COI_SUCCESS
) {
5169 if (num_devices
> MIC_ENGINES_MAX
) {
5170 num_devices
= MIC_ENGINES_MAX
;
5173 // fill in the list of devices that can be used for offloading
5174 env_var
= getenv("OFFLOAD_DEVICES");
5176 if (strcasecmp(env_var
, "none") != 0) {
5177 // value is composed of comma separated physical device indexes
5178 char *buf
= strdup(env_var
);
5180 LIBOFFLOAD_ERROR(c_malloc
);
5182 for (str
= strtok_r(buf
, ",", &ptr
); str
!= 0;
5183 str
= strtok_r(0, ",", &ptr
)) {
5184 // convert string to an int
5186 if (!__offload_parse_int_string(str
, num
)) {
5187 LIBOFFLOAD_ERROR(c_mic_init5
);
5189 // fallback to using all installed devices
5191 for (int i
= 0; i
< num_devices
; i
++) {
5196 if (num
< 0 || num
>= num_devices
) {
5197 LIBOFFLOAD_ERROR(c_mic_init6
, num
);
5206 // use all available devices
5207 for (int i
= 0; i
< num_devices
; i
++) {
5209 res
= COI::EngineGetHandle(COI_ISA_MIC
, i
, &engine
);
5210 if (res
== COI_SUCCESS
) {
5216 mic_engines_total
= devices
.count();
5218 // no need to continue if there are no devices to offload to
5219 if (mic_engines_total
<= 0) {
5223 // initialize indexes for available devices
5224 mic_engines
= new Engine
[mic_engines_total
];
5225 for (int p_idx
= 0, l_idx
= 0; p_idx
< num_devices
; p_idx
++) {
5226 if (devices
[p_idx
]) {
5227 mic_engines
[l_idx
].set_indexes(l_idx
, p_idx
);
5232 // Get DMA channel count to pass it to COI
5233 env_var
= getenv("OFFLOAD_DMA_CHANNEL_COUNT");
5236 if (__offload_parse_int_string(env_var
, new_val
)) {
5237 mic_dma_channel_count
= new_val
;
5240 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5241 "OFFLOAD_DMA_CHANNEL_COUNT");
5245 // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
5246 // Use putenv instead of setenv as Windows has no setenv.
5247 // Note: putenv requires its argument can't be freed or modified.
5248 // So no free after call to putenv or elsewhere.
5249 env_var
= getenv("OFFLOAD_HOST_THREAD_AFFINITY");
5251 char * new_env_var
=
5252 (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
5254 if (new_env_var
== NULL
)
5255 LIBOFFLOAD_ERROR(c_malloc
);
5256 sprintf(new_env_var
, "COI_HOST_THREAD_AFFINITY=%s", env_var
);
5257 putenv(new_env_var
);
5260 // library search path for device binaries
5261 env_var
= getenv("MIC_LD_LIBRARY_PATH");
5263 mic_library_path
= strdup(env_var
);
5264 if (mic_library_path
== NULL
)
5265 LIBOFFLOAD_ERROR(c_malloc
);
5269 // find target executable to be used if main application is not an
5270 // offload build application.
5271 const char *base_name
= "offload_main";
5272 if (mic_library_path
!= 0) {
5273 char *buf
= strdup(mic_library_path
);
5275 LIBOFFLOAD_ERROR(c_malloc
);
5276 char *try_name
= (char*) alloca(strlen(mic_library_path
) +
5277 strlen(base_name
) + 2);
5280 for (dir
= strtok_r(buf
, PATH_SEPARATOR
, &ptr
); dir
!= 0;
5281 dir
= strtok_r(0, PATH_SEPARATOR
, &ptr
)) {
5282 // compose a full path
5283 sprintf(try_name
, "%s/%s", dir
, base_name
);
5285 // check if such file exists
5287 if (stat(try_name
, &st
) == 0 && S_ISREG(st
.st_mode
)) {
5288 mic_device_main
= strdup(try_name
);
5289 if (mic_device_main
== NULL
)
5290 LIBOFFLOAD_ERROR(c_malloc
);
5298 // memory size reserved for COI buffers
5299 env_var
= getenv("MIC_BUFFERSIZE");
5302 if (__offload_parse_size_string(env_var
, new_size
)) {
5303 mic_buffer_size
= new_size
;
5306 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_BUFFERSIZE");
5310 // memory size reserved for 4K pages for COI buffers
5311 env_var
= getenv("MIC_4K_BUFFER_RESERVE_SIZE");
5314 if (__offload_parse_size_string(env_var
, new_size
)) {
5315 mic_4k_buffer_size
= new_size
;
5318 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_4K_BUFFER_RESERVE_SIZE");
5322 // memory size reserved for 2M pages for COI buffers
5323 env_var
= getenv("MIC_2M_BUFFER_RESERVE_SIZE");
5326 if (__offload_parse_size_string(env_var
, new_size
)) {
5327 mic_2m_buffer_size
= new_size
;
5330 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, "MIC_2M_BUFFER_RESERVE_SIZE");
5334 // determine stacksize for the pipeline on the device
5335 env_var
= getenv("MIC_STACKSIZE");
5336 if (env_var
!= 0 && *env_var
!= '\0') {
5338 if (__offload_parse_size_string(env_var
, new_size
) &&
5339 (new_size
>= 16384) && ((new_size
& 4095) == 0)) {
5340 mic_stack_size
= new_size
;
5343 LIBOFFLOAD_ERROR(c_mic_init3
);
5348 env_var
= getenv("MIC_PROXY_IO");
5349 if (env_var
!= 0 && *env_var
!= '\0') {
5351 if (__offload_parse_int_string(env_var
, new_val
)) {
5352 mic_proxy_io
= new_val
;
5355 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
, "MIC_PROXY_IO");
5358 env_var
= getenv("MIC_PROXY_FS_ROOT");
5359 if (env_var
!= 0 && *env_var
!= '\0') {
5360 mic_proxy_fs_root
= strdup(env_var
);
5361 if (mic_proxy_fs_root
== NULL
)
5362 LIBOFFLOAD_ERROR(c_malloc
);
5365 // Prepare environment for the target process using the following
5367 // - If MIC_ENV_PREFIX is set then any environment variable on the
5368 // host which has that prefix are copied to the device without
5370 // All other host environment variables are ignored.
5371 // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then host
5372 // environment is duplicated.
5373 env_var
= getenv("MIC_ENV_PREFIX");
5374 if (env_var
!= 0 && *env_var
!= '\0') {
5375 mic_env_vars
.set_prefix(env_var
);
5377 int len
= strlen(env_var
);
5378 for (int i
= 0; environ
[i
] != 0; i
++) {
5379 if (strncmp(environ
[i
], env_var
, len
) == 0 &&
5380 strncmp(environ
[i
], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
5381 environ
[i
][len
] != '=') {
5382 mic_env_vars
.analyze_env_var(environ
[i
]);
5387 // create key for thread data
5388 if (thread_key_create(&mic_thread_key
, Engine::destroy_thread_data
)) {
5389 LIBOFFLOAD_ERROR(c_mic_init4
, errno
);
5394 cpu_frequency
= COI::PerfGetCycleFrequency();
5396 env_var
= getenv(mic_use_2mb_buffers_envname
);
5397 if (env_var
!= 0 && *env_var
!= '\0') {
5399 if (__offload_parse_size_string(env_var
, new_size
)) {
5400 __offload_use_2mb_buffers
= new_size
;
5403 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5404 mic_use_2mb_buffers_envname
);
5408 env_var
= getenv(mic_use_async_buffer_write_envname
);
5409 if (env_var
!= 0 && *env_var
!= '\0') {
5411 if (__offload_parse_size_string(env_var
, new_size
)) {
5412 __offload_use_async_buffer_write
= new_size
;
5416 env_var
= getenv(mic_use_async_buffer_read_envname
);
5417 if (env_var
!= 0 && *env_var
!= '\0') {
5419 if (__offload_parse_size_string(env_var
, new_size
)) {
5420 __offload_use_async_buffer_read
= new_size
;
5424 // mic initialization type
5425 env_var
= getenv(offload_init_envname
);
5426 if (env_var
!= 0 && *env_var
!= '\0') {
5427 if (strcmp(env_var
, "on_offload") == 0) {
5428 __offload_init_type
= c_init_on_offload
;
5430 else if (strcmp(env_var
, "on_offload_all") == 0) {
5431 __offload_init_type
= c_init_on_offload_all
;
5433 else if (strcmp(env_var
, "on_start") == 0) {
5434 __offload_init_type
= c_init_on_start
;
5437 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, offload_init_envname
);
5442 env_var
= getenv(offload_active_wait_envname
);
5443 if (env_var
!= 0 && *env_var
!= '\0') {
5445 if (__offload_parse_int_string(env_var
, new_val
)) {
5446 __offload_active_wait
= new_val
;
5449 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
5450 offload_active_wait_envname
);
5455 env_var
= getenv(omp_device_num_envname
);
5456 if (env_var
!= 0 && *env_var
!= '\0') {
5458 if (__offload_parse_int_string(env_var
, new_val
) && new_val
>= 0) {
5459 __omp_device_num
= new_val
;
5462 LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env
,
5463 omp_device_num_envname
);
5467 // parallel copy of offload_transfer
5468 env_var
= getenv(parallel_copy_envname
);
5469 if (env_var
!= 0 && *env_var
!= '\0') {
5471 if (__offload_parse_int_string(env_var
, new_val
) && new_val
>= 0) {
5472 __offload_parallel_copy
= new_val
;
5475 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5476 parallel_copy_envname
);
5480 // use COI interface for noncontiguous arrays transfer
5481 env_var
= getenv(use_coi_noncontiguous_transfer_envname
);
5482 if (env_var
!= 0 && *env_var
!= '\0') {
5484 if (__offload_parse_size_string(env_var
, new_size
)) {
5485 __offload_use_coi_noncontiguous_transfer
= new_size
;
5488 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
5489 use_coi_noncontiguous_transfer_envname
);
5497 extern int __offload_init_library(void)
5499 // do one time intialization
5500 static OffloadOnceControl ctrl
= OFFLOAD_ONCE_CONTROL_INIT
;
5501 __offload_run_once(&ctrl
, __offload_init_library_once
);
5503 // offload is available if COI is available and the number of devices > 0
5504 bool is_available
= COI::is_available
&& (mic_engines_total
> 0);
5506 // register pending libraries if there are any
5507 if (is_available
&& __target_libs
) {
5508 mutex_locker_t
locker(__target_libs_lock
);
5510 for (TargetImageList::iterator it
= __target_libs_list
.begin();
5511 it
!= __target_libs_list
.end(); it
++) {
5512 // Register library in COI
5513 COI::ProcessRegisterLibraries(1, &it
->data
, &it
->size
,
5514 &it
->origin
, &it
->offset
);
5516 // add lib to all engines
5517 for (int i
= 0; i
< mic_engines_total
; i
++) {
5518 mic_engines
[i
].add_lib(*it
);
5522 __target_libs
= false;
5523 __target_libs_list
.clear();
5526 return is_available
;
5529 extern "C" bool __offload_target_image_is_executable(const void *target_image
)
5531 const struct Image
*image
= static_cast<const struct Image
*>(target_image
);
5534 const char *name
= image
->data
;
5535 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5537 // determine image type
5538 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
5539 return (hdr
->e_type
== ET_EXEC
);
// User-visible entry: register a target image (executable or shared
// library) with the offload runtime.
// NOTE(review): this chunk was damaged in extraction -- several
// original lines are missing (the malloc failure check's "if", the
// copy-loop increments, the switch "case" labels, returns and
// braces); the surviving text is preserved verbatim below with
// review comments added.
5542 extern "C" bool __offload_register_image(const void *target_image
)
// unpack the image: NUL-terminated name, then binary contents
5544 const struct Image
*image
= static_cast<const struct Image
*>(target_image
);
5547 const char *name
= image
->data
;
5548 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5549 uint64_t size
= image
->size
;
// buffer that will receive the host-side ("origin") file name
5550 char *origin
= (char *) malloc(strlen(image
->data
) + 1);
5551 uint64_t offset
= 0;
5552 const char *host_name
= image
->data
;
// error path when the origin buffer could not be allocated
// (the guarding "if (origin == NULL)" was lost in extraction)
5556 LIBOFFLOAD_ERROR(c_malloc
);
5558 // The origin name is the name of the file on the host
5559 // this is used by Vtune, since it is a fat binary we
5560 // use the host file name of the fat binary.
5561 // Driver prepends the host file name ending with "?"
5562 // to the image->data name so need to extract the string
// copy characters up to the '?' separator into origin
5564 while (*host_name
!= '\0' && *host_name
!= '?') {
5565 origin
[i
] = *host_name
;
5570 // Implies the host name does not exist which really should
5571 // not occur. Allow this since only consumer is Vtune.
5572 if ((i
== 0) || (*host_name
!= '?')) {
5577 // our actions depend on the image type
5578 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
5579 switch (hdr
->e_type
) {
// --- executable image (its "case" label was lost in extraction) ---
5581 // Each offload application is supposed to have only one target
5582 // image representing target executable.
5583 // No thread synchronization is required here as the initialization
5584 // code is always executed in a single thread.
5585 if (__target_exe
!= 0) {
5586 LIBOFFLOAD_ERROR(c_multiple_target_exes
);
5589 __target_exe
= new TargetImage(name
, data
, size
, origin
, offset
);
5591 // Registration code for execs is always called from the context
5592 // of main and thus we can safely call any function here,
5593 // including LoadLibrary API on windows. This is the place where
5594 // we do the offload library initialization.
5595 if (__offload_init_library()) {
5596 // initialize engine if init_type is on_start
5597 if (__offload_init_type
== c_init_on_start
) {
5598 for (int i
= 0; i
< mic_engines_total
; i
++) {
5599 mic_engines
[i
].init();
5603 return mic_engines_total
> 0;
// --- shared library image (its "case" label was lost in extraction) ---
5607 char *fullname
= origin
;
5608 // We add the library to a list of pending libraries
5609 __target_libs_lock
.lock();
5610 __target_libs
= true;
5611 __target_libs_list
.push_back(
5612 TargetImage(name
, data
, size
, fullname
, offset
));
5613 __target_libs_lock
.unlock();
5614 // If __target_exe is set, then main has started running
5615 // If not main, then we can't do anything useful here
5616 // because this registration code is called from DllMain
5617 // context (on windows).
5618 if (__target_exe
!= 0) {
5619 // There is no need to delay loading the library
5620 if (!__offload_init_library()) {
5621 // Couldn't validate library as a fat offload library
5622 LIBOFFLOAD_ERROR(c_unknown_binary_type
);
// --- default: unrecognized ELF image type ---
5630 // something is definitely wrong, issue an error and exit
5631 LIBOFFLOAD_ERROR(c_unknown_binary_type
);
// User-visible entry: unregister a target image at program teardown.
// NOTE(review): this chunk was damaged in extraction -- the Image
// struct members, the #ifdef MYO_SUPPORT opener matching the #endif
// below, and several braces are missing; the surviving text is
// preserved verbatim with review comments added.
5636 extern "C" void __offload_unregister_image(const void *target_image
)
5638 // Target image is packed as follows:
5639 // 8 bytes - size of the target binary
5640 // null-terminated string - binary name
5641 // <size> bytes - binary contents
5642 const struct Image
{
// NOTE(review): the struct's field declarations were lost in extraction
5645 } *image
= static_cast<const struct Image
*>(target_image
);
// unpack name and binary contents
5648 const char *name
= image
->data
;
5649 const void *data
= image
->data
+ strlen(image
->data
) + 1;
5651 // our actions depend on the image type
5652 const Elf64_Ehdr
*hdr
= static_cast<const Elf64_Ehdr
*>(data
);
5653 if (hdr
->e_type
== ET_EXEC
) {
5654 // We are executing exec's desctructors.
5655 // It is time to do a library cleanup.
5656 if (timer_enabled
) {
5657 Offload_Timer_Print();
// MYO teardown (its #ifdef MYO_SUPPORT opener was lost in extraction)
5661 __offload_myoFini();
5662 #endif // MYO_SUPPORT
5664 __offload_fini_library();
5666 else if (hdr
->e_type
== ET_DYN
) {
// shared library image: unload it from every engine
5667 for (int i
= 0; i
< mic_engines_total
; i
++) {
5668 mic_engines
[i
].unload_library(data
, name
);
5674 extern "C" void __offload_register_task_callback(void (*cb
)(void *))
5676 task_completion_callback
= cb
;
5679 // Runtime trace interface for user programs
5681 void __offload_console_trace(int level
)
5683 console_enabled
= level
;
5686 // User-visible offload API
5688 int _Offload_number_of_devices(void)
5690 __offload_init_library();
5691 return mic_engines_total
;
5694 int _Offload_get_device_number(void)
5699 int _Offload_get_physical_device_number(void)
// User API: test whether the async offload identified by (index,
// signal) has completed.
// NOTE(review): this chunk was damaged in extraction -- the index
// validation "if", the task == 0 branch, several returns and braces
// are missing; the surviving text is preserved verbatim with review
// comments added.
5704 int _Offload_signaled(int index
, void *signal
)
// lazily initialize the library before touching engine state
5706 __offload_init_library();
5708 // check index value
// error path for an out-of-range index (guarding "if" lost in extraction)
5710 LIBOFFLOAD_ERROR(c_offload_signaled1
, index
);
// map the logical index onto an existing engine
5714 index
%= mic_engines_total
;
5716 // find associated async task
5717 OffloadDescriptor
*task
=
5718 mic_engines
[index
].find_signal(signal
, false);
// error path when no task is associated with the signal
// (guarding condition lost in extraction)
5720 LIBOFFLOAD_ERROR(c_offload_signaled2
, signal
);
5723 // if signal is removed by wait completing
5724 else if (task
== SIGNAL_IS_REMOVED
) {
// otherwise ask the descriptor itself
5727 return task
->is_signaled();
5730 void _Offload_report(int val
)
5732 if (val
== OFFLOAD_REPORT_ON
||
5733 val
== OFFLOAD_REPORT_OFF
) {
5734 offload_report_enabled
= val
;
// User API: look up the MIC-side buffer association for a host
// address and report its base, length, device address/offset and
// whether it is static.
// NOTE(review): this chunk was damaged in extraction -- some
// parameters (presumably 'target', 'mic_addr', 'is_static'; they are
// used below but not declared in the visible text), the target range
// check, early returns and braces are missing; surviving text is
// preserved verbatim with review comments added.
5738 int _Offload_find_associated_mic_memory(
5740 const void* cpu_addr
,
5741 void** cpu_base_addr
,
5742 uint64_t* buf_length
,
5744 uint64_t* mic_buf_start_offset
,
// lazy library initialization before touching engine state
5748 __offload_init_library();
5750 // check target value
// error path for an out-of-range target (guarding "if" lost in extraction)
5752 LIBOFFLOAD_ERROR(c_offload_signaled1
, target
);
// map the logical target onto an existing engine
5755 target
%= mic_engines_total
;
5757 // find existing association in pointer table
5758 PtrData
* ptr_data
= mic_engines
[target
].find_ptr_data(cpu_addr
);
5759 if (ptr_data
== 0) {
5760 OFFLOAD_TRACE(3, "Association does not exist\n");
5764 OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
5765 ptr_data
->cpu_addr
.start(), ptr_data
->cpu_addr
.length(),
5766 ptr_data
->is_static
);
// lazily resolve the sink-side address of the MIC buffer
5768 if (ptr_data
->mic_buf
!= 0 && ptr_data
->mic_addr
== 0) {
5769 COIRESULT res
= COI::BufferGetSinkAddress(ptr_data
->mic_buf
,
5770 &ptr_data
->mic_addr
);
5771 if (res
!= COI_SUCCESS
) {
// fill in the caller's output parameters
5775 *cpu_base_addr
= const_cast<void *>(ptr_data
->cpu_addr
.start());
5776 *buf_length
= ptr_data
->cpu_addr
.length() - ptr_data
->alloc_disp
;
5777 *mic_addr
= (void *)(ptr_data
->mic_addr
+ ptr_data
->mic_offset
);
5778 *mic_buf_start_offset
= ptr_data
->alloc_disp
;
5779 *is_static
= ptr_data
->is_static
;
// static buffers report 1; dynamic ones report their reference count
5780 return ptr_data
->is_static
? 1 : ptr_data
->get_reference();
// User API: create a stream on the given device with the requested
// number of cores, and set up its pipeline.
// NOTE(review): this chunk was damaged in extraction -- the device
// range check, the failure return after the trace, the final return
// of the handle and braces are missing; surviving text preserved
// verbatim with review comments added.
5783 _Offload_stream
_Offload_stream_create(
5784 int device
, // MIC device number
5785 int number_of_cpus
// Cores allocated to the stream
// lazy library initialization
5788 __offload_init_library();
5790 // check target value
// error path for an out-of-range device (guarding "if" lost in extraction)
5792 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
// map the logical device onto an existing engine
5795 device
%= mic_engines_total
;
5797 // Create new stream and get its handle
5798 _Offload_stream handle
= Stream::add_stream(device
, number_of_cpus
);
// trace emitted when stream creation fails (guarding "if" lost)
5800 OFFLOAD_TRACE(3, "Can't create stream\n");
5804 // create pipeline associated with the new stream
5805 mic_engines
[device
].get_pipeline(handle
);
// User API: destroy a stream on the given device.
// NOTE(review): this chunk was damaged in extraction -- the device
// range check, the return statement and braces are missing; the
// surviving text is preserved verbatim with review comments added.
5810 int _Offload_stream_destroy(
5811 int device
, // MIC device number
5812 _Offload_stream handle
// stream to destroy
// lazy library initialization
5815 __offload_init_library();
5817 // check target value
// error path for an out-of-range device (guarding "if" lost in extraction)
5819 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
// map the logical device onto an existing engine
5822 device
%= mic_engines_total
;
// delegate teardown to the owning engine
5824 mic_engines
[device
].stream_destroy(handle
);
// User API: test whether the offloads in a stream -- or, for a zero
// handler, in every stream on the device -- have completed.
// NOTE(review): this chunk was damaged in extraction -- the device
// range check, the declaration of 'stream', the handler == 0 branch
// condition, null-task branches, several returns and braces are
// missing; surviving text preserved verbatim with review comments.
5829 int _Offload_stream_completed(int device
, _Offload_stream handler
)
// lazy library initialization
5831 __offload_init_library();
5833 // check index value
// error path for an out-of-range device (guarding "if" lost in extraction)
5835 LIBOFFLOAD_ERROR(c_offload_signaled1
, device
);
// map the logical device onto an existing engine
5839 device
%= mic_engines_total
;
// non-zero handler: look up that specific stream
5845 stream
= Stream::find_stream(handler
, false);
5847 // the stream was not created or was destroyed
5849 LIBOFFLOAD_ERROR(c_offload_no_stream
, device
);
5853 // find associated async task
5854 OffloadDescriptor
*task
= stream
->get_last_offload();
5856 // offload was completed by offload_wait pragma or wait clause
5860 return task
->is_signaled();
5862 // zero handler is for all streams at the device
// iterate over a snapshot of the global stream map
5864 StreamMap stream_map
= Stream::all_streams
;
5865 for (StreamMap::iterator it
= stream_map
.begin();
5866 it
!= stream_map
.end(); it
++) {
5867 Stream
* stream
= it
->second
;
5868 // find associated async task
5869 OffloadDescriptor
*task
= stream
->get_last_offload();
5871 // offload was completed by offload_wait pragma or wait clause
5875 // if even one stream is not completed result is false
5876 if (!task
->is_signaled()) {
5880 // no uncompleted streams
// Debugger-support globals: state exported to an attaching debugger.
// nonzero once a debugger has attached
5886 int __dbg_is_attached
= 0;
// id of the target device under debug (-1 = none)
5887 int __dbg_target_id
= -1;
// pid of the target-side process (-1 = none)
5888 pid_t __dbg_target_so_pid
= -1;
// name of the target executable; fixed-size, zero-initialized buffer
5889 char __dbg_target_exe_name
[MAX_TARGET_NAME
] = {0};
// version of the debugger interface implemented here
5890 const int __dbg_api_major_version
= 1;
5891 const int __dbg_api_minor_version
= 0;
5893 void __dbg_target_so_loaded()
5896 void __dbg_target_so_unloaded()