/*
    Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Forward declaration, as the following 2 functions are declared as friend
// in offload_engine.h.
// CLANG does not like static to be after the friend declaration.
static void __offload_init_library_once(void);
static void __offload_fini_library(void);
#include "offload_host.h"
#ifdef MYO_SUPPORT
#include "offload_myo_host.h"
#endif // MYO_SUPPORT

#ifndef TARGET_WINNT
#include <alloca.h>
#include <elf.h>
#endif // TARGET_WINNT
#include <sys/types.h>
#if defined(HOST_WINNT)
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif

#define GET_OFFLOAD_NUMBER(timer_data) \
    timer_data? timer_data->offload_number : 0
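// Note: GET_OFFLOAD_NUMBER is null-safe on purpose. The trace helpers below
// may be handed a NULL OffloadHostTimerData pointer, and the macro falls
// back to 0 so callers do not have to guard the dereference themselves.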
static void (*task_completion_callback)(void *);
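// Completion hook for asynchronous offloads: a host runtime (for example an
// OpenMP runtime) can register a function here so it is notified when an
// offload task finishes. It stays NULL when no runtime registers one.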
extern "C" {
#ifdef TARGET_WINNT
// Windows does not support imports from libraries without actually
// including them as a dependence. We don't want to include the library in
// the dependence since it is used only for Fortran when traceback is
// enabled. We chose to implement it with GetProcAddress instead.
#define FORTRAN_TRACE_BACK  win_for__continue_traceback
int win_for__continue_traceback( _Offload_result coi_offload_result )
{
    HINSTANCE hDLL;
    int (* TraceBackRoutine)(_Offload_result value);

    hDLL = LoadLibrary("libifcoremd.dll");
    if (hDLL != 0) {
        TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
                                              "for__continue_traceback");
        if (TraceBackRoutine != 0) {
            return TraceBackRoutine(coi_offload_result);
        }
        else {
            OFFLOAD_TRACE(3,
            "Cannot find for__continue_traceback routine in libifcoremd.dll\n");
            exit(1);
        }
    }
    else {
        OFFLOAD_TRACE(3, "Cannot load libifcoremd.dll\n");
        exit(1);
    }
    return 0;
}
#else // TARGET_WINNT
#define FORTRAN_TRACE_BACK for__continue_traceback

// for__continue_traceback is provided as a dummy to resolve link time symbols
// for C/C++ programs. For Fortran the actual Fortran library function in
// libifcore.so is used.
#pragma weak for__continue_traceback
int for__continue_traceback( _Offload_result coi_offload_result )
{
    OFFLOAD_TRACE(3,
    "liboffload function for__continue_traceback should not be called.\n");
    exit(1);
}
#endif // TARGET_WINNT
} // extern "C"
#ifdef TARGET_WINNT
// Small subset of ELF declarations for Windows which is needed to compile
// this file. The ELF header is used to understand what binary type is
// contained in the target image - shared library or executable.

typedef uint16_t Elf64_Half;
typedef uint32_t Elf64_Word;
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;

#define EI_NIDENT   16

#define ET_EXEC     2
#define ET_DYN      3

typedef struct
{
    unsigned char e_ident[EI_NIDENT];
    Elf64_Half    e_type;
    Elf64_Half    e_machine;
    Elf64_Word    e_version;
    Elf64_Addr    e_entry;
    Elf64_Off     e_phoff;
    Elf64_Off     e_shoff;
    Elf64_Word    e_flags;
    Elf64_Half    e_ehsize;
    Elf64_Half    e_phentsize;
    Elf64_Half    e_phnum;
    Elf64_Half    e_shentsize;
    Elf64_Half    e_shnum;
    Elf64_Half    e_shstrndx;
} Elf64_Ehdr;
#endif // TARGET_WINNT
// Host console and file logging
int console_enabled = 0;
int offload_number = 0;

static const char *htrace_envname = "H_TRACE";
static const char *offload_report_envname = "OFFLOAD_REPORT";
static const char *timer_envname = "H_TIME";
// DMA channel count used by COI and set via the
// OFFLOAD_DMA_CHANNEL_COUNT environment variable
uint32_t mic_dma_channel_count;
static const char* vardesc_direction_as_string[] = {
    "NOCOPY",
    "IN",
    "OUT",
    "INOUT"
};
static const char* vardesc_type_as_string[] = {
    "unknown",
    "data",
    "data_ptr",
    "func_ptr",
    "void_ptr",
    "string_ptr",
    "dv",
    "dv_data",
    "dv_data_slice",
    "dv_ptr",
    "dv_ptr_data",
    "dv_ptr_data_slice",
    "cean_var",
    "cean_var_ptr",
    "c_data_ptr_array",
    "c_extended_type",
    "c_func_ptr_array",
    "c_void_ptr_array",
    "c_string_ptr_array",
    "c_data_ptr_ptr",
    "c_func_ptr_ptr",
    "c_void_ptr_ptr",
    "c_string_ptr_ptr",
    "c_cean_var_ptr_ptr",
};
Engine*         mic_engines = 0;
uint32_t        mic_engines_total = 0;
pthread_key_t   mic_thread_key;
MicEnvVar       mic_env_vars;
uint64_t        cpu_frequency = 0;

uint32_t mic_stack_size = 12 * 1024 * 1024;

uint64_t mic_buffer_size = 0;

// Preallocated 4K page memory size for buffers on MIC
uint64_t mic_4k_buffer_size = 0;

// Preallocated 2M page memory size for buffers on MIC
uint64_t mic_2m_buffer_size = 0;

// LD_LIBRARY_PATH for KNC
char* knc_library_path = 0;

// LD_LIBRARY_PATH for KNL
char* knl_library_path = 0;

bool mic_proxy_io = true;

char* mic_proxy_fs_root = 0;

// Threshold for creating buffers with large pages. A buffer is created
// with the large pages hint if its size exceeds the threshold value.
// By default large pages are disabled right now (by setting the default
// value for the threshold to MAX) due to HSD 4114629.
uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
static const char *mic_use_2mb_buffers_envname =
    "MIC_USE_2MB_BUFFERS";
static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_write_envname =
    "MIC_USE_ASYNC_BUFFER_WRITE";

static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
static const char *mic_use_async_buffer_read_envname =
    "MIC_USE_ASYNC_BUFFER_READ";

// device initialization type
OffloadInitType __offload_init_type = c_init_on_offload_all;
static const char *offload_init_envname = "OFFLOAD_INIT";

static bool __offload_active_wait = true;
static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";

// wait even for asynchronous offload;
// true for now while the performance issue with COI is not fixed
static bool __offload_always_wait = true;
static const char *offload_always_wait_envname = "OFFLOAD_ALWAYS_WAIT";

// OMP_DEFAULT_DEVICE
int __omp_device_num = 0;
static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";

// OFFLOAD_PARALLEL_COPY
static bool __offload_parallel_copy = false;
static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";

// Use the COI interface for noncontiguous transfer if it exists.
static bool __offload_use_coi_noncontiguous_transfer = false;
static const char *use_coi_noncontiguous_transfer_envname =
    "MIC_USE_COI_MULTI_D";
// The list of pending target libraries
static bool            __target_libs;
static TargetImageList __target_libs_list;
static mutex_t         __target_libs_lock;
static mutex_t         stack_alloc_lock;
static mutex_t         lock_complete;

// Set of OffloadDescriptors of asynchronous offloads that are not destroyed
std::map<void *, bool> offload_descr_map;

TargetImage*           __target_exe;
// is true if the last loaded image is a dll
bool __current_image_is_dll = false;
// is true if the myo library is loaded when the dll is loaded
bool __myo_init_in_so = false;
// Print readable offload flags
static void trace_offload_flags(
    OffloadHostTimerData* timer_data,
    OffloadFlags offload_flags
)
{
    // Sized big enough for all flag names
    char fbuffer[256];
    bool first = true;
    if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
        sprintf(fbuffer, "   OffloadFlags=(");
        if (offload_flags.bits.fortran_traceback) {
            sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
            first = false;
        }
        if (offload_flags.bits.omp_async) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "omp_async" : ",omp_async");
            first = false;
        }
        OFFLOAD_DEBUG_TRACE_1(1,
            GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
            "%s)\n", fbuffer);
    }
}
// Print readable varDesc flags
static void trace_varDesc_flags(
    OffloadHostTimerData* timer_data,
    varDescFlags offload_flags
)
{
    // Sized big enough for all flag names
    char fbuffer[256];
    bool first = true;
    if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
        sprintf(fbuffer, "   varDescFlags=(");
        if (offload_flags.is_static) {
            sprintf(fbuffer+strlen(fbuffer), "is_static");
            first = false;
        }
        if (offload_flags.is_static_dstn) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_static_dstn" : ",is_static_dstn");
            first = false;
        }
        if (offload_flags.has_length) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "has_length" : ",has_length");
            first = false;
        }
        if (offload_flags.is_stack_buf) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_stack_buf" : ",is_stack_buf");
            first = false;
        }
        if (offload_flags.targetptr) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "targetptr" : ",targetptr");
            first = false;
        }
        if (offload_flags.preallocated) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "preallocated" : ",preallocated");
            first = false;
        }
        if (offload_flags.is_pointer) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_pointer" : ",is_pointer");
            first = false;
        }
        if (offload_flags.sink_addr) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "sink_addr" : ",sink_addr");
            first = false;
        }
        if (offload_flags.alloc_disp) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "alloc_disp" : ",alloc_disp");
            first = false;
        }
        if (offload_flags.is_noncont_src) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_noncont_src" : ",is_noncont_src");
            first = false;
        }
        if (offload_flags.is_noncont_dst) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_noncont_dst" : ",is_noncont_dst");
            first = false;
        }
        if (offload_flags.always_copy) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "always_copy" : ",always_copy");
            first = false;
        }
        if (offload_flags.always_delete) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "always_delete" : ",always_delete");
            first = false;
        }
        if (offload_flags.is_non_cont_struct) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_non_cont_struct" : ",is_non_cont_struct");
            first = false;
        }
        if (offload_flags.pin) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "pin" : ",pin");
            first = false;
        }
        if (offload_flags.is_device_ptr) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "is_device_ptr" : ",is_device_ptr");
            first = false;
        }
        if (offload_flags.use_device_ptr) {
            sprintf(fbuffer+strlen(fbuffer),
                first ? "use_device_ptr" : ",use_device_ptr");
            first = false;
        }
        OFFLOAD_DEBUG_TRACE_1(1,
            GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
            "%s)\n", fbuffer);
    }
}
static char * offload_get_src_base(void * ptr, uint8_t type)
{
    char *base;
    if (VAR_TYPE_IS_PTR(type)) {
        base = *static_cast<char**>(ptr);
    }
    else if (VAR_TYPE_IS_SCALAR(type)) {
        base = static_cast<char*>(ptr);
    }
    else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
        ArrDesc *dvp;
        if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
            const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
            dvp = (type == c_dv_data_slice) ?
                  reinterpret_cast<ArrDesc*>(ap->base) :
                  *reinterpret_cast<ArrDesc**>(ap->base);
        }
        else {
            dvp = (type == c_dv_data) ?
                  static_cast<ArrDesc*>(ptr) :
                  *static_cast<ArrDesc**>(ptr);
        }
        base = reinterpret_cast<char*>(dvp->Base);
    }
    else {
        base = NULL;
    }
    return base;
}
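// Note: the helper above decodes the "source base" for every variable kind
// the compiler can emit: plain pointers are dereferenced once, scalars are
// used as-is, and dope-vector (DV) variants read the Base field out of the
// ArrDesc, with slices going through the embedded Arr_Desc first.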
void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
{
    // special case for the 'process died' error
    if (res == COI_PROCESS_DIED) {
        m_device.fini_process(true);
    }
    else {
        switch (msg) {
            case c_buf_create:
                if (res == COI_OUT_OF_MEMORY) {
                    msg = c_buf_create_out_of_mem;
                }
                /* fallthru */

            case c_buf_create_from_mem:
            case c_buf_get_address:
            case c_pipeline_create:
            case c_pipeline_run_func:
                LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
                break;

            case c_buf_read:
            case c_buf_write:
            case c_buf_copy:
            case c_buf_map:
            case c_buf_unmap:
            case c_buf_destroy:
            case c_buf_set_state:
                LIBOFFLOAD_ERROR(msg, res);
                break;

            default:
                break;
        }
    }

    exit(1);
}
_Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
{
    switch (res) {
        case COI_SUCCESS:
            return OFFLOAD_SUCCESS;

        case COI_PROCESS_DIED:
            return OFFLOAD_PROCESS_DIED;

        case COI_OUT_OF_MEMORY:
            return OFFLOAD_OUT_OF_MEMORY;

        default:
            return OFFLOAD_ERROR;
    }
}
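// The two routines above implement the library's common COI error protocol:
// when an offload is optional (m_status != 0) the COIRESULT is translated
// and stored for the caller to inspect, and only mandatory offloads
// (m_is_mandatory) escalate through report_coi_error, which never returns.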
// alloc_ptr_data() modes:
// is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
// is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
//     allocate memory at target; use its value as base in the target table.
// is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
//     base is the address at target of the preallocated memory; use its
//     value as base in the target table.
bool OffloadDescriptor::alloc_ptr_data(
    PtrData* &ptr_data,
    void *base,
    int64_t disp,
    int64_t size,
    int64_t alloc_disp,
    int align,
    bool is_targptr,
    bool is_prealloc,
    bool pin
)
{
    // total length of base
    int64_t length = size;
    bool is_new;
    COIBUFFER targptr_buf;
    COIRESULT res;
    uint32_t buffer_flags = 0;
    char * base_disp = reinterpret_cast<char *>(base) + disp;

    // create buffer with large pages if data length exceeds
    // large page threshold
    if (length >= __offload_use_2mb_buffers) {
        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
    }
    // Allocate memory at target for targetptr without preallocated as we need
    // its address as base argument in call to m_device.insert_ptr_data
    if (is_targptr && !is_prealloc) {
        length = alloc_disp ? length : size + disp;
        res = COI::BufferCreate(
            length,
            COI_BUFFER_NORMAL,
            buffer_flags,
            0,
            1,
            &m_device.get_process(),
            &targptr_buf);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            else if (m_is_mandatory) {
                report_coi_error(c_buf_create, res);
            }
            return false;
        }

        res = COI::BufferGetSinkAddress(
            targptr_buf, reinterpret_cast<uint64_t *>(&base));
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            else if (m_is_mandatory) {
                report_coi_error(c_buf_get_address, res);
            }
            return false;
        }
    }

    OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
                  alloc_disp ? base : base_disp,
                  alloc_disp ? length : size + disp);

    // find or create an entry
    ptr_data = is_targptr ?
               m_device.find_targetptr_data(base_disp) :
               m_device.find_ptr_data(base_disp);
    // if ptr_data is found just need to check it for overlapping
    if (ptr_data) {
        is_new = false;
        base = base_disp;
    }
    else {
        // If association is not found we must create it.
        length = alloc_disp ? length : size + disp;
        ptr_data = is_targptr ?
                   m_device.insert_targetptr_data(base, length, is_new) :
                   m_device.insert_ptr_data(base, length, is_new);
    }
    if (is_new) {

        OFFLOAD_TRACE(3, "Added new association\n");

        if (length > 0) {
            OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);

            // align should be a power of 2
            if (!pin && !is_targptr &&
                align > 0 && (align & (align - 1)) == 0) {
                // offset within mic_buffer. Can do offset optimization
                // only when source address alignment satisfies requested
                // alignment on the target (cq172736).
                if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
                    ptr_data->mic_offset =
                        reinterpret_cast<intptr_t>(base) & 4095;
                }
            }

            // buffer size and flags
            uint64_t buffer_size = length + ptr_data->mic_offset;

            // For targetptr there is no CPU buffer
            if (pin || !is_targptr) {
                // create the CPU buffer
                OFFLOAD_DEBUG_TRACE_1(3,
                          GET_OFFLOAD_NUMBER(get_timer_data()),
                          c_offload_create_buf_host,
                          "Creating buffer from source memory %p, "
                          "length %lld\n", base, length);

                // result is not checked because we can continue without cpu
                // buffer. In this case we will use COIBufferRead/Write
                // instead of COIBufferCopy.

                COI::BufferCreateFromMemory(length,
                                            COI_BUFFER_OPENCL,
                                            0,
                                            base,
                                            1,
                                            &m_device.get_process(),
                                            &ptr_data->cpu_buf);
            }

            // create the MIC buffer
            if (is_prealloc) {
                OFFLOAD_DEBUG_TRACE_1(3,
                          GET_OFFLOAD_NUMBER(get_timer_data()),
                          c_offload_create_buf_mic,
                          "Creating buffer from sink memory: "
                          "addr %p, size %lld, offset %d, flags 0x%x\n",
                          base, buffer_size, ptr_data->mic_offset,
                          buffer_flags);
                res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
                                                  COI_BUFFER_NORMAL,
                                                  COI_SINK_MEMORY,
                                                  base,
                                                  1,
                                                  &m_device.get_process(),
                                                  &ptr_data->mic_buf);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_create, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }
            else if (is_targptr) {
                ptr_data->mic_buf = targptr_buf;
            }
            else if (!pin) {
                OFFLOAD_DEBUG_TRACE_1(3,
                          GET_OFFLOAD_NUMBER(get_timer_data()),
                          c_offload_create_buf_mic,
                          "Creating buffer for sink: size %lld, offset %d, "
                          "flags =0x%x\n", buffer_size,
                          ptr_data->mic_offset, buffer_flags);
                res = COI::BufferCreate(buffer_size,
                                        COI_BUFFER_NORMAL,
                                        buffer_flags,
                                        0,
                                        1,
                                        &m_device.get_process(),
                                        &ptr_data->mic_buf);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_create, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }

            if (!pin) {
                // make buffer valid on the device.
                res = COI::BufferSetState(ptr_data->mic_buf,
                                          m_device.get_process(),
                                          COI_BUFFER_VALID,
                                          COI_BUFFER_NO_MOVE,
                                          0, 0, 0);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_set_state, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }

                res = COI::BufferSetState(ptr_data->mic_buf,
                                          COI_PROCESS_SOURCE,
                                          COI_BUFFER_INVALID,
                                          COI_BUFFER_NO_MOVE,
                                          0, 0, 0);
                if (res != COI_SUCCESS) {
                    if (m_status != 0) {
                        m_status->result = translate_coi_error(res);
                    }
                    else if (m_is_mandatory) {
                        report_coi_error(c_buf_set_state, res);
                    }
                    ptr_data->alloc_ptr_data_lock.unlock();
                    return false;
                }
            }
        }

        ptr_data->alloc_disp = alloc_disp;
        ptr_data->alloc_ptr_data_lock.unlock();
    }
    else {
        mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);

        OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
                      "is_static %d\n",
                      ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                      ptr_data->is_static);

        // This is not a new entry. Make sure that the provided address range
        // fits into the existing one.
        MemRange addr_range(base, length);
        if (!ptr_data->cpu_addr.contains(addr_range)) {
            LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
                             const_cast<void *>(ptr_data->cpu_addr.start()),
                             ptr_data->cpu_addr.length());
            exit(1);
        }

        // if the entry is associated with static data it may not have buffers
        // created because they are created on demand.
        if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
            return false;
        }
    }

    return true;
}
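// Locking protocol: a freshly inserted entry appears to come back from
// insert_*_data() with alloc_ptr_data_lock already held, which is why every
// error path in the is_new branch above unlocks it before returning, while
// an existing entry is guarded with a scoped mutex_locker_t instead.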
bool OffloadDescriptor::find_ptr_data(
    PtrData* &ptr_data,
    void *in_base,
    int64_t disp,
    int64_t size,
    bool is_targetptr,
    bool report_error
)
{
    // total length of base
    int64_t length = size;
    char *base = reinterpret_cast<char *>(in_base) + disp;

    OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
                  "length %lld\n", base, length);

    // find existing association in pointer table
    ptr_data = is_targetptr ?
               m_device.find_targetptr_data(base) :
               m_device.find_ptr_data(base);
    if (ptr_data == 0) {
        if (report_error) {
            LIBOFFLOAD_ERROR(c_no_ptr_data, base);
            exit(1);
        }
        OFFLOAD_TRACE(3, "Association does not exist\n");
        return true;
    }

    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                  ptr_data->is_static);

    // make sure that the provided address range fits into the existing one
    MemRange addr_range(base, length);
    if (!ptr_data->cpu_addr.contains(addr_range)) {
        if (report_error) {
            LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
                             const_cast<void *>(ptr_data->cpu_addr.start()),
                             ptr_data->cpu_addr.length());
            exit(1);
        }
        OFFLOAD_TRACE(3, "Existing association partially overlaps with "
                      "data address range\n");
        ptr_data = 0;
        return true;
    }

    // if the entry is associated with static data it may not have buffers
    // created because they are created on demand.
    if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
        return false;
    }

    return true;
}
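// Unlike alloc_ptr_data(), this lookup never creates an association; a
// missing or mismatched entry is fatal only when report_error is set,
// otherwise the caller sees ptr_data == 0 and decides what to do.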
void OffloadDescriptor::find_device_ptr(
    int64_t* &device_ptr,
    void *host_ptr
)
{
    PtrData* ptr_data;
    char *base = reinterpret_cast<char *>(host_ptr);

    OFFLOAD_TRACE(3, "Looking for association for data: addr %p\n", base);

    // find existing association in pointer table
    ptr_data = m_device.find_ptr_data(base);

    // MIC address should have been assigned.
    // For now assume it does not exist and get the addr
    // if ((ptr_data == 0) || ptr_data->mic_addr) {
    if (ptr_data == 0) {
        OFFLOAD_TRACE(3, "Association does not exist\n");
        LIBOFFLOAD_ERROR(c_no_ptr_data, base);
        exit(1);
    }
    if (!ptr_data->mic_addr) {
        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                                  &ptr_data->mic_addr);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            report_coi_error(c_buf_get_address, res);
        }
    }

    device_ptr = (int64_t *) ptr_data->mic_addr;

    OFFLOAD_TRACE(3, "Found association: host_ptr %p, device_ptr = %p\n",
                  ptr_data->cpu_addr.start(), device_ptr);
}
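// Used by the use_device_ptr handling below: given a host pointer that was
// mapped earlier, this returns the sink-side address, querying COI lazily
// the first time it is needed.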
bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);

    if (ptr_data->cpu_buf == 0) {
        OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
                      ptr_data->cpu_addr.start());

        COIRESULT res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_OPENCL,
            0,
            const_cast<void*>(ptr_data->cpu_addr.start()),
            1, &m_device.get_process(),
            &ptr_data->cpu_buf);

        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_create_from_mem, res);
        }
    }

    if (ptr_data->mic_buf == 0) {
        OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
                      ptr_data->mic_addr);

        COIRESULT res = COI::BufferCreateFromMemory(
            ptr_data->cpu_addr.length(),
            COI_BUFFER_NORMAL,
            COI_SINK_MEMORY,
            reinterpret_cast<void*>(ptr_data->mic_addr),
            1, &m_device.get_process(),
            &ptr_data->mic_buf);

        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_create_from_mem, res);
        }
    }

    return true;
}
bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
{
    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                                  &ptr_data->mic_addr);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
            }
            else if (m_is_mandatory) {
                report_coi_error(c_buf_get_address, res);
            }
            return false;
        }
    }
    return true;
}
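// init_mic_address() is idempotent: it fills in mic_addr only when the
// buffer exists and the address has not been fetched yet, so callers can
// invoke it opportunistically before sending a sink address to the device.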
bool OffloadDescriptor::nullify_target_stack(
    COIBUFFER targ_buf,
    uint64_t size
)
{
    char * ptr = (char*)malloc(size);
    if (ptr == NULL)
      LIBOFFLOAD_ERROR(c_malloc);
    COIRESULT res;

    memset(ptr, 0, size);
    res = COI::BufferWrite(
        targ_buf,
        0,
        ptr,
        size,
        COI_COPY_UNSPECIFIED,
        0, 0, 0);
    free(ptr);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
            return false;
        }
        report_coi_error(c_buf_write, res);
    }
    return true;
}
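// Zeroes are staged in a temporary host allocation and pushed with a single
// COIBufferWrite, so the target stack buffer is guaranteed to read back as
// zeros before any stack variables are copied in.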
static void print_persistList_item(
    const char *msg,
    PersistData *cur_el
)
{
    OFFLOAD_TRACE(4, "%s\n", msg);
    OFFLOAD_TRACE(4, "    stack_cpu_addr = %p\n", cur_el->stack_cpu_addr);
    OFFLOAD_TRACE(4, "    routine_id = %d\n", cur_el->routine_id);
    OFFLOAD_TRACE(4, "    thread_id = %lld\n", cur_el->thread_id);
    OFFLOAD_TRACE(4, "    stack_ptr_data = %p\n", cur_el->stack_ptr_data);
    OFFLOAD_TRACE(4, "    MIC buffer = %p\n", cur_el->stack_ptr_data->mic_buf);
    OFFLOAD_TRACE(4, "    MIC addr = %p\n", cur_el->stack_ptr_data->mic_addr);
    OFFLOAD_TRACE(4, "    cpu_stack_addr = %p\n", cur_el->cpu_stack_addr);
}
static mutex_t stack_memory_manager_lock;

bool OffloadDescriptor::offload_stack_memory_manager(
    const void * stack_begin,
    int  routine_id,
    int  buf_size,
    int  align,
    bool thread_specific_function_locals,
    bool *is_new
)
{
    //mutex_locker_t locker(stack_alloc_lock);
    stack_memory_manager_lock.lock();

    PersistData * new_el;
    PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
    PersistDataList::iterator it_end;
    int erase = 0;
    uint64_t cur_thread_id = m_device.get_thread_id();

    *is_new = false;

    OFFLOAD_TRACE(3, "offload_stack_memory_manager("
                  "stack_begin=%p, routine_id=%d, buf_size=%d,"
                  "align=%d, thread_specific_function_locals=%d, bool=%p)\n",
                  stack_begin, routine_id, buf_size,
                  align, thread_specific_function_locals, is_new);
    OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);

    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
        it != m_device.m_persist_list.end(); it++) {
        PersistData cur_el = *it;

        print_persistList_item("Current element in persist list:", &cur_el);
        if (stack_begin > it->stack_cpu_addr) {
            if (cur_thread_id == cur_el.thread_id) {
                // this stack data must be destroyed
                m_destroy_stack.push_front(cur_el.stack_ptr_data);
                it_end = it;
                erase++;
                OFFLOAD_TRACE(3, "Current element below TOS: so delete\n");
            }
        }
        else if (stack_begin == it->stack_cpu_addr) {
            if (routine_id != it->routine_id) {
                // this stack data must be destroyed
                // because the current function is a dynamic sibling
                m_destroy_stack.push_front(cur_el.stack_ptr_data);
                it_end = it;
                erase++;
                OFFLOAD_TRACE(3, "Current element is sibling: so delete\n");
                break;
            }
            else if (!thread_specific_function_locals ||
                     cur_thread_id == cur_el.thread_id) {
                // stack data is reused
                m_stack_ptr_data = it->stack_ptr_data;
                if (erase > 0) {
                    // all obsolete stack sections must be erased from the list
                    m_device.m_persist_list.erase(it_begin, ++it_end);
                    m_in_datalen +=
                        erase * sizeof(new_el->stack_ptr_data->mic_addr);
                }
                OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
                              m_stack_ptr_data->mic_addr);
                stack_memory_manager_lock.unlock();
                return true;
            }
        }
        else if (stack_begin < it->stack_cpu_addr &&
                 cur_thread_id == cur_el.thread_id) {
            OFFLOAD_TRACE(3, "Current element is above TOS\n");
            break;
        }
    }

    if (erase > 0) {
        // all obsolete stack sections must be erased from the list
        m_device.m_persist_list.erase(it_begin, ++it_end);
        m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
    }
    // a new stack table is created
    new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
    // create MIC buffer
    COIRESULT res;
    uint32_t buffer_flags = 0;

    // create buffer with large pages if data length exceeds
    // large page threshold
    if (buf_size >= __offload_use_2mb_buffers) {
        buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
    }
    res = COI::BufferCreate(buf_size,
                            COI_BUFFER_NORMAL,
                            buffer_flags,
                            0,
                            1,
                            &m_device.get_process(),
                            &new_el->stack_ptr_data->mic_buf);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_create, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }

    // make buffer valid on the device.
    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
                              m_device.get_process(),
                              COI_BUFFER_VALID,
                              COI_BUFFER_NO_MOVE,
                              0, 0, 0);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_set_state, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }

    res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
                              COI_PROCESS_SOURCE,
                              COI_BUFFER_INVALID,
                              COI_BUFFER_NO_MOVE,
                              0, 0, 0);
    if (res != COI_SUCCESS) {
        if (m_status != 0) {
            m_status->result = translate_coi_error(res);
        }
        else if (m_is_mandatory) {
            report_coi_error(c_buf_set_state, res);
        }
        stack_memory_manager_lock.unlock();
        return false;
    }

    // the persistence algorithm requires the target stack initially to be
    // nullified
    if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
        stack_memory_manager_lock.unlock();
        return false;
    }

    m_stack_ptr_data = new_el->stack_ptr_data;
    init_mic_address(m_stack_ptr_data);
    OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
                  m_stack_ptr_data->mic_addr);
    m_device.m_persist_list.push_front(*new_el);
    init_mic_address(new_el->stack_ptr_data);
    *is_new = true;

    stack_memory_manager_lock.unlock();
    return true;
}
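// Summary of the policy above: entries below the current top of stack (for
// the same thread) are stale and queued for destruction, an entry at the
// same address is either reused (same routine) or destroyed (a dynamic
// sibling call), and the scan stops once an entry above the top of stack
// for this thread is reached; only then is a fresh buffer allocated.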
// Search through persistent stack buffers
// for the top-of-stack buffer for this thread
char* OffloadDescriptor::get_this_threads_cpu_stack_addr(
    const void * stack_begin,
    int routine_id,
    bool thread_specific_function_locals
)
{
    uint64_t cur_thread_id = m_device.get_thread_id();
    char* matched = 0;

    OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr("
        "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
        stack_begin, routine_id, thread_specific_function_locals);
    OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);

    stack_memory_manager_lock.lock();
    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
         it != m_device.m_persist_list.end(); it++)
    {
        PersistData cur_el = *it;
        print_persistList_item("Current element in persist list:", &cur_el);
        if (stack_begin == cur_el.stack_cpu_addr)
        {
            // For OpenMP shared function locals matching is done without
            // regard to thread id. But, we return the last match, which
            // corresponds to the outer stack.
            if (!thread_specific_function_locals)
            {
                matched = cur_el.cpu_stack_addr;
                continue;
            }
            // For non-OpenMP shared function-local variables
            // the thread-id must match
            if (cur_thread_id == cur_el.thread_id)
            {
                matched = cur_el.cpu_stack_addr;
                break;
            }
        }
    }
    stack_memory_manager_lock.unlock();

    if (matched != 0)
    {
        OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr() => %p\n", matched);
        return matched;
    }

    OFFLOAD_TRACE(1,
        "Could not find persistent data; expect Read/Write failure\n");
    return 0;
}
// Search through persistent stack buffers
// for the top-of-stack MIC buffer for this thread
PtrData* OffloadDescriptor::get_this_threads_mic_stack_addr(
    const void * stack_begin,
    int routine_id,
    bool thread_specific_function_locals
)
{
    uint64_t cur_thread_id = m_device.get_thread_id();
    PtrData* matched = 0;

    OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr("
        "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
        stack_begin, routine_id, thread_specific_function_locals);
    OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);

    stack_memory_manager_lock.lock();
    for (PersistDataList::iterator it = m_device.m_persist_list.begin();
         it != m_device.m_persist_list.end(); it++)
    {
        PersistData cur_el = *it;
        print_persistList_item("Current element in persist list:", &cur_el);
        if (stack_begin == cur_el.stack_cpu_addr)
        {
            // For OpenMP shared function locals matching is done without
            // regard to thread id. But, we return the last match, which
            // corresponds to the outer stack.
            if (!thread_specific_function_locals)
            {
                matched = cur_el.stack_ptr_data;
                continue;
            }
            // For non-OpenMP shared function-local variables
            // the thread-id must match
            if (cur_thread_id == cur_el.thread_id)
            {
                matched = cur_el.stack_ptr_data;
                break;
            }
        }
    }
    stack_memory_manager_lock.unlock();

    if (matched != 0)
    {
        OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr() => %p\n", matched);
        return matched;
    }

    OFFLOAD_TRACE(1,
        "Could not find persistent data; expect Read/Write failure\n");
    return 0;
}
void OffloadDescriptor::setup_use_device_ptr(int i)
{
    PtrData *ptr_data;
    ArrDesc *dvp;
    void *base;
    if (m_vars_extra[i].type_src == c_dv_ptr) {
        dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
        base = reinterpret_cast<void*>(dvp->Base);
    }
    else {
        base = *static_cast<void**>(m_vars[i].ptr);
    }
    if (m_vars[i].direction.in) {
        int64_t *device_ptr;
        bool is_new;

        find_device_ptr(device_ptr, base);

        // Create an entry in the targetptr table using device_ptr
        // as the key to later recover the host pointer
        ptr_data = m_device.insert_targetptr_data(device_ptr,
                                                  0, is_new);
        // Actually the base is a host pointer and cpu_addr is the
        // device pointer. This is a special case where the two
        // address usages are reversed to enable using the existing
        // PtrData structure instead of adding new fields.
        ptr_data->mic_addr = (uint64_t) base;

        ptr_data->alloc_ptr_data_lock.unlock();

        // Replace host pointer with device pointer
        if (m_vars_extra[i].type_src == c_dv_ptr) {
            dvp->Base = reinterpret_cast<dv_size>(device_ptr);
        }
        else {
            *static_cast<void**>(m_vars[i].ptr) = device_ptr;
        }
    }
    else if (m_vars[i].direction.out) {
        // For use_device_ptr and out find the associated host ptr
        // and assign it to the host ptr
        ptr_data = m_device.find_targetptr_data(base);
        if (!ptr_data) {
            LIBOFFLOAD_ERROR(c_no_ptr_data, base);
            exit(1);
        }
        if (m_vars_extra[i].type_src == c_dv_ptr) {
            dvp->Base = ptr_data->mic_addr;
        }
        else {
            *static_cast<void**>(m_vars[i].ptr) =
                reinterpret_cast<void*>(ptr_data->mic_addr);
        }
        m_device.remove_targetptr_data(
            ptr_data->cpu_addr.start());
    }
}
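// Design note: for use_device_ptr the PtrData roles are deliberately
// swapped (cpu_addr keys on the device pointer, mic_addr stores the host
// pointer), so the "in" branch above must be paired with the "out" branch,
// which restores the host pointer and drops the temporary association.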
bool OffloadDescriptor::setup_descriptors(
    VarDesc *vars,
    VarDesc2 *vars2,
    int vars_total,
    int entry_id,
    const void *stack_addr
)
{
    COIRESULT res;
    // To enable caching the CPU stack base address for stack variables
    char* this_threads_cpu_stack_addr = 0;
    // To properly deal with non-OpenMP threading and function-local variables.
    // For OpenMP threading we support all function-locals in shared mode only.
    bool thread_specific_function_locals = !omp_in_parallel();

    OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
    // make a copy of variable descriptors
    m_vars_total = vars_total;
    if (vars_total > 0) {
        m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
        if (m_vars == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
        m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
        if (m_vars_extra == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
    }

    // dependencies
    m_in_deps_allocated = m_vars_total + 1;
    m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
    if (m_in_deps == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    if (m_vars_total > 0) {
        m_out_deps_allocated = m_vars_total;
        m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
        if (m_out_deps == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
    }
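    // Note the asymmetry above: m_in_deps reserves one slot beyond the
    // per-variable transfer events (the extra slot accommodates an event
    // that is not tied to a single variable), while m_out_deps needs at
    // most one event per variable.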
    // copyin/copyout data length
    m_in_datalen = 0;
    m_out_datalen = 0;

    // First pass over variable descriptors
    // - Calculate size of the input and output non-pointer data
    // - Allocate buffers for input and output pointers
    for (int i = 0; i < m_vars_total; i++) {
        void*   alloc_base = NULL;
        int64_t alloc_disp = 0;
        int64_t alloc_size = 0;
        bool    src_is_for_mic = (m_vars[i].direction.out ||
                                  m_vars[i].into == NULL);
        bool    src_is_for_host = (m_vars[i].direction.in ||
                                   m_vars[i].into == NULL);
        const char *var_sname = "";
        if (vars2 != NULL && i < vars_total) {
            if (vars2[i].sname != NULL) {
                var_sname = vars2[i].sname;
            }
        }

        // instead of m_vars[i].type.src we will use m_vars_extra[i].type_src
        if (m_vars[i].type.src == c_extended_type) {
            VarDescExtendedType *etype =
                reinterpret_cast<VarDescExtendedType*>(m_vars[i].ptr);
            m_vars_extra[i].type_src = etype->extended_type;
            m_vars[i].ptr = etype->ptr;
        }
        else {
            m_vars_extra[i].type_src = m_vars[i].type.src;
        }
        // instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst
        if (m_vars[i].type.dst == c_extended_type) {
            VarDescExtendedType *etype =
                reinterpret_cast<VarDescExtendedType*>(m_vars[i].into);
            if (etype) {
                m_vars_extra[i].type_dst = etype->extended_type;
                m_vars[i].into = etype->ptr;
            }
            else {
                m_vars_extra[i].type_dst = m_vars_extra[i].type_src;
            }
        }
        else {
            m_vars_extra[i].type_dst = m_vars[i].type.dst;
        }
        OFFLOAD_TRACE(2, "   VarDesc %d, var=%s, %s, %s\n",
                      i, var_sname,
                      vardesc_direction_as_string[m_vars[i].direction.bits],
                      vardesc_type_as_string[m_vars_extra[i].type_src]);
        if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
            OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
                          vardesc_type_as_string[m_vars_extra[i].type_dst]);
        }
        OFFLOAD_TRACE(2,
            "              type_src=%d, type_dstn=%d, direction=%d, "
            "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
            "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
            m_vars_extra[i].type_src,
            m_vars_extra[i].type_dst,
            m_vars[i].direction.bits,
            m_vars[i].alloc_if,
            m_vars[i].free_if,
            m_vars[i].align,
            m_vars[i].mic_offset,
            m_vars[i].flags.bits,
            m_vars[i].offset,
            m_vars[i].size,
            m_vars[i].count,
            m_vars[i].ptr,
            m_vars[i].into);
        // If any varDesc flags bits are set, show them
        if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
            trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
        }
        // preallocated implies targetptr
        if (m_vars[i].flags.preallocated) {
            // targetptr preallocated alloc_if(1) may not be used with
            // the in clause
            if (m_vars[i].direction.in && m_vars[i].alloc_if) {
                LIBOFFLOAD_ERROR(c_in_with_preallocated);
                exit(1);
            }
            m_vars[i].flags.targetptr = 1;
        }
        if (m_vars[i].alloc != NULL) {
            // array descriptor
            const Arr_Desc *ap =
                static_cast<const Arr_Desc*>(m_vars[i].alloc);

            // debug dump
            ARRAY_DESC_DUMP("    ", "ALLOC", ap, 0, 1);

            __arr_data_offset_and_length(ap, alloc_disp, alloc_size);

            alloc_base = reinterpret_cast<void*>(ap->base);
        }

        m_vars_extra[i].alloc = m_vars[i].alloc;
        m_vars_extra[i].auto_data = 0;
        m_vars_extra[i].cpu_disp = 0;
        m_vars_extra[i].cpu_offset = 0;
        m_vars_extra[i].src_data = 0;
        m_vars_extra[i].read_rng_src = 0;
        m_vars_extra[i].read_rng_dst = 0;
        m_vars_extra[i].omp_last_event_type = c_last_not;
        // flag is_arr_ptr_el is 1 only for var_descs generated
        // for the c_data_ptr_array type
        if (i < vars_total) {
            m_vars_extra[i].is_arr_ptr_el = 0;
        }

        if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
            TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
            m_vars[i].flags.is_pointer) {
            m_vars_extra[i].pointer_offset = m_vars[i].offset;
            m_vars[i].offset = 0;
            m_in_datalen += sizeof(m_vars[i].offset);
        }
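        // For pointer-to-pointer variables the original offset is parked in
        // m_vars_extra[i].pointer_offset and accounted for as extra input
        // data; the offset field itself is zeroed so the generic buffer
        // offset math below is not applied to it a second time.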
        switch (m_vars_extra[i].type_src) {
            case c_data_ptr_array:
                {
                    const Arr_Desc *ap;
                    const VarDesc3 *vd3 =
                        static_cast<const VarDesc3*>(m_vars[i].ptr);
                    int flags = vd3->array_fields;
                    OFFLOAD_TRACE(2,
                        "              pointer array flags = %04x\n", flags);
                    OFFLOAD_TRACE(2,
                        "              pointer array type is %s\n",
                        vardesc_type_as_string[flags & 0x3f]);
                    ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
                    ARRAY_DESC_DUMP("    ", "ptr array", ap,
                                    m_vars[i].flags.is_pointer, 1);
                    if (m_vars[i].into) {
                        ap = static_cast<const Arr_Desc*>(m_vars[i].into);
                        ARRAY_DESC_DUMP(
                            "    ", "into array", ap, 0, 1);
                    }
                    if ((flags & (1<<flag_align_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->align_array);
                        ARRAY_DESC_DUMP(
                            "    ", "align array", ap, 0, 1);
                    }
                    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
                        ARRAY_DESC_DUMP(
                            "    ", "alloc_if array", ap, 0, 1);
                    }
                    if ((flags & (1<<flag_free_if_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
                        ARRAY_DESC_DUMP(
                            "    ", "free_if array", ap, 0, 1);
                    }
                    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->extent_start);
                        ARRAY_DESC_DUMP(
                            "    ", "extent_start array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_extent_start_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              extent_start scalar = %lld\n",
                            (int64_t)vd3->extent_start);
                    }
                    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>
                            (vd3->extent_elements);
                        ARRAY_DESC_DUMP("    ",
                                        "extent_elements array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_extent_elements_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              extent_elements scalar = %lld\n",
                            (int64_t)vd3->extent_elements);
                    }
                    if ((flags & (1<<flag_into_start_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->into_start);
                        ARRAY_DESC_DUMP(
                            "    ", "into_start array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_into_start_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              into_start scalar = %lld\n",
                            (int64_t)vd3->into_start);
                    }
                    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->into_elements);
                        ARRAY_DESC_DUMP(
                            "    ", "into_elements array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_into_elements_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              into_elements scalar = %lld\n",
                            (int64_t)vd3->into_elements);
                    }
                    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
                        ARRAY_DESC_DUMP(
                            "    ", "alloc_start array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_alloc_start_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              alloc_start scalar = %lld\n",
                            (int64_t)vd3->alloc_start);
                    }
                    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
                        ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
                        ARRAY_DESC_DUMP("    ",
                                        "alloc_elements array", ap, 0, 1);
                    } else if ((flags &
                        (1<<flag_alloc_elements_is_scalar)) != 0) {
                        OFFLOAD_TRACE(2,
                            "              alloc_elements scalar = %lld\n",
                            (int64_t)vd3->alloc_elements);
                    }
                }
                if (!gen_var_descs_for_pointer_array(i)) {
                    return false;
                }
                break;

            case c_data:
            case c_void_ptr:
            case c_void_ptr_ptr:
            case c_cean_var:
                // In all uses later:
                // VarDesc.size will have the length of the data to be
                // transferred;
                // VarDesc.disp will have an offset from base.

                if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
                    NonContigDesc *desc =
                        static_cast<NonContigDesc*>(m_vars[i].ptr);
                    noncont_struct_dump("    ", "DATA", desc);
                    m_vars_extra[i].noncont_desc = desc;
                    m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
                    m_vars[i].size = get_noncont_struct_size(desc);
                    m_vars[i].disp = 0;
                }
                else if (m_vars_extra[i].type_src == c_cean_var) {
                    // array descriptor
                    const Arr_Desc *ap =
                        static_cast<const Arr_Desc*>(m_vars[i].ptr);

                    // debug dump
                    ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);

                    // offset and length are derived from the array descriptor
                    __arr_data_offset_and_length(ap, m_vars[i].disp,
                                                 m_vars[i].size);
                    if (!is_arr_desc_contiguous(ap)) {
                        m_vars[i].flags.is_noncont_src = 1;
                        m_vars_extra[i].read_rng_src =
                            init_read_ranges_arr_desc(ap);
                    }
                    // all necessary information about length and offset is
                    // transferred in the var descriptor. There is no need to
                    // send the array descriptor to the target side.
                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
                }
                else {
                    m_vars[i].size *= m_vars[i].count;
                    m_vars[i].disp = 0;
                }

                if (m_vars[i].direction.bits) {
                    // make sure that transfer size > 0
                    if (m_vars[i].size <= 0) {
                        LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
                        exit(1);
                    }

                    if (m_vars[i].flags.is_static) {
                        PtrData *ptr_data;
                        // find data associated with variable
                        if (!find_ptr_data(ptr_data,
                                           m_vars[i].ptr,
                                           m_vars[i].disp,
                                           m_vars[i].size,
                                           false, false)) {
                            return false;
                        }
                        if (ptr_data != 0) {
                            // offset to base from the beginning of the buffer
                            // memory
                            m_vars[i].offset =
                                (char*) m_vars[i].ptr -
                                (char*) ptr_data->cpu_addr.start();
                        }
                        else {
                            m_vars[i].flags.is_static = false;
                            if (m_vars[i].into == NULL) {
                                m_vars[i].flags.is_static_dstn = false;
                            }
                        }
                        m_vars_extra[i].src_data = ptr_data;
                    }
                    if (m_vars[i].direction.in &&
                        !m_vars[i].flags.is_static &&
                        !m_vars[i].flags.is_stack_buf) {
                        m_in_datalen += m_vars[i].size;

                        // for a non-static target destination defined as a
                        // CEAN expression we pass its size and disp to target
                        if (m_vars[i].into == NULL &&
                            m_vars_extra[i].type_src == c_cean_var) {
                            m_in_datalen += 2 * sizeof(uint64_t);
                        }
                        m_need_runfunction = true;
                    }
                    if (m_vars[i].direction.out &&
                        !m_vars[i].flags.is_static &&
                        !m_vars[i].flags.is_stack_buf) {
                        m_out_datalen += m_vars[i].size;
                        m_need_runfunction = true;
                    }
                }

                if (m_is_openmp && src_is_for_host &&
                    !m_vars[i].flags.is_device_ptr) {
                    if (m_vars[i].flags.is_static) {
                        PtrData *ptr_data = m_vars_extra[i].src_data;
                        // Static data is transferred either by the omp target
                        // update construct, which passes zeros for
                        // alloc_if and free_if, or by the always modifier.
                        // An implicit OpenMP reference is transferred as well
                        // if its reference count is equal to 1.
                        if (ptr_data &&
                            IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
                            if (m_vars[i].alloc_if) {
                                ptr_data->add_reference();
                            }

                            if (!m_vars[i].flags.always_copy &&
                                (m_vars[i].alloc_if || m_vars[i].free_if) &&
                                ptr_data->get_reference() != 1) {
                                m_vars[i].direction.bits = c_parameter_nocopy;
                            }
                        }
                        else if (ptr_data &&
                                 !m_vars[i].flags.always_copy &&
                                 (m_vars[i].alloc_if || m_vars[i].free_if)) {
                            m_vars[i].direction.bits = c_parameter_nocopy;
                        }
                    }
                    else {
                        AutoData *auto_data;
                        if (m_vars[i].alloc_if) {
                            auto_data = m_device.insert_auto_data(
                                m_vars[i].ptr, m_vars[i].size);
                            auto_data->add_reference();
                        }
                        else {
                            // TODO: what should be done if var is not in
                            // the table?
                            auto_data = m_device.find_auto_data(
                                m_vars[i].ptr);
                        }

                        // For automatic variables data is transferred:
                        // - if the always modifier is used OR
                        // - if alloc_if == 0 && free_if == 0 OR
                        // - if the reference count is 1
                        if (!m_vars[i].flags.always_copy &&
                            (m_vars[i].alloc_if || m_vars[i].free_if) &&
                            auto_data != 0 &&
                            auto_data->get_reference() != 1) {
                            m_vars[i].direction.bits = c_parameter_nocopy;
                        }

                        // save data for later use
                        m_vars_extra[i].auto_data = auto_data;
                    }
                }
                break;
            case c_dv:
                if (m_vars[i].flags.use_device_ptr) {
                    setup_use_device_ptr(i);
                }
                else if (m_vars[i].direction.bits ||
                         m_vars[i].alloc_if ||
                         m_vars[i].free_if) {
                    ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);

                    // debug dump
                    __dv_desc_dump("IN/OUT", dvp);

                    // send dope vector contents excluding base
                    m_in_datalen += m_vars[i].size - sizeof(uint64_t);
                    m_need_runfunction = true;
                }
                break;
            case c_string_ptr:
            case c_string_ptr_ptr:
                if ((m_vars[i].direction.bits ||
                     m_vars[i].alloc_if ||
                     m_vars[i].free_if) &&
                    m_vars[i].size == 0) {
                    m_vars[i].size =
                        strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
                }
                /* fallthru */

            case c_data_ptr:
            case c_data_ptr_ptr:
                if (m_vars[i].flags.is_stack_buf &&
                    !m_vars[i].direction.bits &&
                    m_vars[i].alloc_if) {
                    // this var_desc is for a stack buffer
                    bool is_new;

                    if (!offload_stack_memory_manager(
                            stack_addr, entry_id,
                            m_vars[i].count, m_vars[i].align,
                            thread_specific_function_locals, &is_new)) {
                        return false;
                    }
                    if (is_new) {
                        m_compute_buffers.push_back(
                            m_stack_ptr_data->mic_buf);
                        m_device.m_persist_list.front().cpu_stack_addr =
                            static_cast<char*>(m_vars[i].ptr);
                        PersistData *new_el = &m_device.m_persist_list.front();
                        print_persistList_item(
                            "New element in persist list:",
                            new_el);
                    }
                    else {
                        m_vars[i].flags.sink_addr = 1;
                        m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
                        if (thread_specific_function_locals) {
                            m_stack_ptr_data = get_this_threads_mic_stack_addr(
                                stack_addr, entry_id,
                                thread_specific_function_locals);
                        }
                    }
                    m_vars[i].size = m_destroy_stack.size();
                    m_vars_extra[i].src_data = m_stack_ptr_data;

                    // need to add or remove references for the stack buffer
                    // at the target
                    if (is_new || m_destroy_stack.size()) {
                        m_need_runfunction = true;
                    }
                    break;
                }
                /* fallthru */

            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            case c_dv_ptr:
                if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
                    NonContigDesc *desc =
                        static_cast<NonContigDesc*>(m_vars[i].ptr);
                    noncont_struct_dump("    ", "PTR", desc);
                    m_vars_extra[i].noncont_desc = desc;
                    m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
                    m_vars[i].disp = 0;
                }
                else if (m_vars_extra[i].type_src == c_cean_var_ptr ||
                         m_vars_extra[i].type_src == c_cean_var_ptr_ptr) {
                    // array descriptor
                    const Arr_Desc *ap =
                        static_cast<const Arr_Desc*>(m_vars[i].ptr);

                    // debug dump
                    ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);

                    // offset and length are derived from the array descriptor
                    __arr_data_offset_and_length(ap, m_vars[i].disp,
                                                 m_vars[i].size);

                    if (!is_arr_desc_contiguous(ap)) {
                        m_vars[i].flags.is_noncont_src = 1;
                        m_vars_extra[i].read_rng_src =
                            init_read_ranges_arr_desc(ap);
                    }
                    // all necessary information about length and offset is
                    // transferred in the var descriptor. There is no need to
                    // send the array descriptor to the target side.
                    m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
                }
                else if (m_vars_extra[i].type_src == c_dv_ptr) {
                    // need to send the DV to the device unless it is 'nocopy'
                    if (m_vars[i].direction.bits ||
                        m_vars[i].alloc_if ||
                        m_vars[i].free_if) {
                        ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);

                        // debug dump
                        __dv_desc_dump("IN/OUT", dvp);

                        // for use_device_ptr we don't need to change
                        // the OUT direction to the IN direction
                        if (!m_vars[i].flags.use_device_ptr) {
                            m_vars[i].direction.bits = c_parameter_in;
                        }
                    }
                    m_vars[i].disp = 0;

                    // For "use_device_ptr", if the direction is "in", find
                    // the associated device pointer and replace the host
                    // pointer with the device pointer; also save the host
                    // pointer to restore when "out" is encountered.
                    // For "out", find the host pointer associated with the
                    // device pointer and restore the host pointer.
                    if (m_vars[i].flags.use_device_ptr && src_is_for_host) {
                        setup_use_device_ptr(i);
                    }
                }
                else {
                    // c_data_ptr or c_string_ptr
                    m_vars[i].size *= m_vars[i].count;
                    m_vars[i].disp = 0;
                }
                if (m_vars[i].direction.bits ||
                    m_vars[i].alloc_if ||
                    m_vars[i].free_if) {
                    PtrData *ptr_data;

                    // check that buffer length > 0
                    if (m_vars[i].alloc_if &&
                        m_vars[i].disp + m_vars[i].size <
                        (m_is_openmp ? 0 : 1)) {
                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
                        exit(1);
                    }

                    // base address
                    void *base = *static_cast<void**>(m_vars[i].ptr);

                    // allocate a buffer if we have no INTO and don't need
                    // allocation for the ptr at target
                    if (src_is_for_mic) {
                        if (m_vars[i].flags.is_stack_buf) {
                            // for stack persistent objects ptr data is created
                            // by var_desc with number 0.
                            // Its ptr_data is stored at m_stack_ptr_data
                            ptr_data = m_stack_ptr_data;
                        }
                        else if (m_vars[i].alloc_if) {
                            if (m_vars[i].flags.preallocated) {
                                m_out_datalen += sizeof(void*);
                                m_need_runfunction = true;
                                break;
                            }
                            // add new entry
                            if (!alloc_ptr_data(
                                    ptr_data,
                                    reinterpret_cast<char *>(base) + alloc_disp,
                                    (alloc_base != NULL) ?
                                        alloc_disp : m_vars[i].disp,
                                    (alloc_base != NULL) ?
                                        alloc_size : m_vars[i].size,
                                    alloc_disp,
                                    (alloc_base != NULL) ?
                                        0 : m_vars[i].align,
                                    m_vars[i].flags.targetptr,
                                    0,
                                    m_vars[i].flags.pin)) {
                                return false;
                            }
                            if (m_vars[i].flags.targetptr) {
                                if (!init_mic_address(ptr_data)) {
                                    return false;
                                }
                                *static_cast<void**>(m_vars[i].ptr) = base =
                                    reinterpret_cast<void*>(ptr_data->mic_addr);
                            }
                            if (ptr_data->add_reference() == 0 &&
                                ptr_data->mic_buf != 0) {
                                // add the buffer to the list of buffers that
                                // are passed to the dispatch call
                                m_compute_buffers.push_back(
                                    ptr_data->mic_buf);
                            }
                            else if (!m_vars[i].flags.pin &&
                                     !m_vars[i].flags.preallocated) {
                                // will send the buffer address to the device
                                m_vars[i].flags.sink_addr = 1;
                                m_in_datalen += sizeof(ptr_data->mic_addr);
                            }

                            if (!m_vars[i].flags.pin &&
                                !ptr_data->is_static) {
                                // need to add a reference for the buffer
                                m_need_runfunction = true;
                            }
                        }
                        else {
                            bool error_if_not_found = true;
                            if (m_is_openmp) {
                                // For omp target update a variable is ignored
                                // if it does not exist.
                                if (m_vars[i].flags.always_copy ||
                                    (!m_vars[i].alloc_if &&
                                     !m_vars[i].free_if)) {
                                    error_if_not_found = false;
                                }
                            }

                            // use existing association from the pointer table
                            if (!find_ptr_data(ptr_data,
                                               base,
                                               m_vars[i].disp,
                                               m_vars[i].size,
                                               m_vars[i].flags.targetptr,
                                               error_if_not_found)) {
                                return false;
                            }

                            if (m_is_openmp) {
                                // make var nocopy if it does not exist
                                if (ptr_data == 0) {
                                    m_vars[i].direction.bits =
                                        c_parameter_nocopy;
                                }
                            }

                            if (ptr_data != 0) {
                                m_vars[i].flags.sink_addr = 1;
                                m_in_datalen += sizeof(ptr_data->mic_addr);
                            }
                        }

                        if (ptr_data != 0) {
                            if (ptr_data->alloc_disp != 0) {
                                m_vars[i].flags.alloc_disp = 1;
                                m_in_datalen += sizeof(alloc_disp);
                            }

                            if (m_vars[i].flags.sink_addr) {
                                // get the buffer's address on the sink
                                if (!init_mic_address(ptr_data)) {
                                    return false;
                                }
                                m_in_datalen += sizeof(ptr_data->mic_addr);
                            }

                            if (!m_vars[i].flags.pin &&
                                !ptr_data->is_static && m_vars[i].free_if) {
                                // need to decrement the buffer reference
                                // on the target
                                m_need_runfunction = true;
                            }

                            // offset to base from the beginning of the buffer
                            // memory
                            m_vars[i].offset = (char*) base -
                                (char*) ptr_data->cpu_addr.start();

                            // copy other pointer properties to var descriptor
                            m_vars[i].mic_offset = ptr_data->mic_offset;
                            m_vars[i].flags.is_static = ptr_data->is_static;
                        }
                    }
                    else {
                        if (!find_ptr_data(ptr_data,
                                           base,
                                           m_vars[i].disp,
                                           m_vars[i].size,
                                           false, false)) {
                            return false;
                        }
                        if (ptr_data) {
                            m_vars[i].offset =
                                (char*) base -
                                (char*) ptr_data->cpu_addr.start();
                        }
                    }

                    if (m_vars[i].flags.use_device_ptr) {
                        setup_use_device_ptr(i);
                    }
2009 if (src_is_for_host
&& m_vars
[i
].flags
.is_stack_buf
) {
2010 AutoData
*auto_data
;
2011 char *base
= *static_cast<char**>(m_vars
[i
].ptr
);
2012 if (m_vars
[i
].alloc_if
) {
2013 auto_data
=m_device
.insert_auto_data(
2014 base
+ m_vars
[i
].disp
,
2016 auto_data
->add_reference();
2019 auto_data
= m_device
.find_auto_data(
2020 base
+ m_vars
[i
].disp
);
2022 // save data for later use
2023 m_vars_extra
[i
].auto_data
= auto_data
;
2025 // For automatic variables
2026 // data is transferred:
2027 // - if always modifier is used OR
2028 // - if alloc_if == 0 && free_if == 0 OR
2029 // - if reference count is 1
2030 if (!m_vars
[i
].flags
.always_copy
&&
2031 (m_vars
[i
].alloc_if
||
2032 m_vars
[i
].free_if
) &&
2034 auto_data
->get_reference() != 1) {
2035 m_vars
[i
].direction
.bits
=
2039 // for FROM transfer of global pointer variable
2040 // FROM transfer of stack buffer's variable
2041 // is treated at INTO branch
2042 else if (src_is_for_mic
&&
2043 !m_vars
[i
].flags
.is_stack_buf
) {
2044 // data is transferred only if
2045 // alloc_if == 0 && free_if == 0
2046 // or reference count is 1
2047 if (!m_vars
[i
].flags
.always_copy
&&
2048 (m_vars
[i
].alloc_if
||
2049 m_vars
[i
].free_if
) &&
2051 ptr_data
->get_reference() != 1)
2053 m_vars
[i
].direction
.bits
=
2058 // save pointer data
2059 m_vars_extra
[i
].src_data
= ptr_data
;
2064 case c_func_ptr_ptr
:
2065 if (m_vars
[i
].direction
.in
) {
2066 m_in_datalen
+= __offload_funcs
.max_name_length();
2068 if (m_vars
[i
].direction
.out
) {
2069 m_out_datalen
+= __offload_funcs
.max_name_length();
2071 m_need_runfunction
= true;
            case c_dv_data:
            case c_dv_ptr_data:
            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
            {
                ArrDesc *dvp;
                if (m_vars[i].flags.is_non_cont_struct) {
                    NonContigDesc *desc =
                        static_cast<NonContigDesc*>(m_vars[i].ptr);
                    noncont_struct_dump("    ", "DV-DATA", desc);
                    dvp = reinterpret_cast<ArrDesc*>(desc->base);
                }
                else if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
                    const Arr_Desc *ap;
                    ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);

                    dvp = (m_vars_extra[i].type_src == c_dv_data_slice) ?
                          reinterpret_cast<ArrDesc*>(ap->base) :
                          *reinterpret_cast<ArrDesc**>(ap->base);
                }
                else {
                    dvp = (m_vars_extra[i].type_src == c_dv_data) ?
                          static_cast<ArrDesc*>(m_vars[i].ptr) :
                          *static_cast<ArrDesc**>(m_vars[i].ptr);
                }

                // if an allocatable dope vector isn't allocated, don't
                // transfer its data
                if (!__dv_is_allocated(dvp)) {
                    m_vars[i].direction.bits = c_parameter_nocopy;
                    m_vars[i].alloc_if = 0;
                    m_vars[i].free_if = 0;
                }
                if (m_vars[i].direction.bits ||
                    m_vars[i].alloc_if ||
                    m_vars[i].free_if) {
                    const Arr_Desc *ap;

                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
                        ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);

                        // debug dump
                        ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
                    }
                    if (!__dv_is_contiguous(dvp)) {
                        m_vars[i].flags.is_noncont_src = 1;
                        m_vars_extra[i].read_rng_src =
                            init_read_ranges_dv(dvp);
                    }

                    // size and displacement
                    if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
                        // offset and length are derived from the
                        // array descriptor
                        __arr_data_offset_and_length(ap,
                                                     m_vars[i].disp,
                                                     m_vars[i].size);
                        if (m_vars[i].direction.bits) {
                            if (!is_arr_desc_contiguous(ap)) {
                                if (m_vars[i].flags.is_noncont_src) {
                                    LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
                                    return false;
                                }
                                m_vars[i].flags.is_noncont_src = 1;
                                m_vars_extra[i].read_rng_src =
                                    init_read_ranges_arr_desc(ap);
                            }
                        }
                    }
                    else {
                        if (m_vars[i].flags.has_length) {
                            m_vars[i].size =
                                __dv_data_length(dvp, m_vars[i].count);
                        }
                        else {
                            m_vars[i].size = __dv_data_length(dvp);
                        }
                        m_vars[i].disp = 0;
                    }

                    // check that length >= 0
                    if (m_vars[i].alloc_if &&
                        (m_vars[i].disp + m_vars[i].size < 0)) {
                        LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
                        exit(1);
                    }
                    // base address
                    void *base = reinterpret_cast<void*>(dvp->Base);
                    PtrData *ptr_data;

                    // allocate a buffer if we have no INTO and don't need
                    // allocation for the ptr at target
                    if (src_is_for_mic) {
                        if (m_vars[i].alloc_if) {
                            // add new entry
                            if (!alloc_ptr_data(
                                    ptr_data,
                                    reinterpret_cast<char *>(base) + alloc_disp,
                                    (alloc_base != NULL) ?
                                        alloc_disp : m_vars[i].disp,
                                    (alloc_base != NULL) ?
                                        alloc_size : m_vars[i].size,
                                    alloc_disp,
                                    (alloc_base != NULL) ?
                                        0 : m_vars[i].align,
                                    m_vars[i].flags.targetptr,
                                    m_vars[i].flags.preallocated,
                                    m_vars[i].flags.pin)) {
                                return false;
                            }

                            if (ptr_data->add_reference() == 0 &&
                                ptr_data->mic_buf != 0) {
                                // add the buffer to the list of buffers
                                // that are passed to the dispatch call
                                m_compute_buffers.push_back(
                                    ptr_data->mic_buf);
                            }
                            else {
                                // will send the buffer address to the device
                                m_vars[i].flags.sink_addr = 1;
                            }

                            if (!ptr_data->is_static) {
                                // need to add a reference for the buffer
                                m_need_runfunction = true;
                            }
                        }
                        else {
                            bool error_if_not_found = true;
                            if (m_is_openmp) {
                                // For omp target update a variable is ignored
                                // if it does not exist.
                                if (m_vars[i].flags.always_copy ||
                                    (!m_vars[i].alloc_if &&
                                     !m_vars[i].free_if)) {
                                    error_if_not_found = false;
                                }
                            }

                            // use existing association from the pointer table
                            if (!find_ptr_data(ptr_data,
                                               base,
                                               m_vars[i].disp,
                                               m_vars[i].size,
                                               m_vars[i].flags.targetptr,
                                               error_if_not_found)) {
                                return false;
                            }

                            if (m_is_openmp) {
                                // make var nocopy if it does not exist
                                if (ptr_data == 0) {
                                    m_vars[i].direction.bits =
                                        c_parameter_nocopy;
                                }
                            }

                            if (ptr_data != 0) {
                                // need to update base in the dope vector
                                // on the device
                                m_vars[i].flags.sink_addr = 1;
                            }
                        }

                        if (ptr_data != 0) {
                            if (m_is_openmp) {
                                // data is transferred:
                                // - if the always modifier is used OR
                                // - if alloc_if == 0 && free_if == 0 OR
                                // - if the reference count is 1
                                if (!m_vars[i].flags.always_copy &&
                                    (m_vars[i].alloc_if ||
                                     m_vars[i].free_if) &&
                                    ptr_data->get_reference() != 1) {
                                    m_vars[i].direction.bits =
                                        c_parameter_nocopy;
                                }
                            }

                            if (ptr_data->alloc_disp != 0) {
                                m_vars[i].flags.alloc_disp = 1;
                                m_in_datalen += sizeof(alloc_disp);
                            }

                            if (m_vars[i].flags.sink_addr) {
                                // get the buffer's address on the sink
                                if (!init_mic_address(ptr_data)) {
                                    return false;
                                }
                                m_in_datalen += sizeof(ptr_data->mic_addr);
                            }

                            if (!ptr_data->is_static && m_vars[i].free_if) {
                                // need to decrement the buffer reference
                                // on the target
                                m_need_runfunction = true;
                            }

                            // offset to base from the beginning of the buffer
                            // memory
                            m_vars[i].offset =
                                (char*) base -
                                (char*) ptr_data->cpu_addr.start();

                            // copy other pointer properties to var descriptor
                            m_vars[i].mic_offset = ptr_data->mic_offset;
                            m_vars[i].flags.is_static = ptr_data->is_static;
                        }
                    }
                    else { // !src_is_for_mic
                        if (!find_ptr_data(ptr_data,
                                           base,
                                           m_vars[i].disp,
                                           m_vars[i].size,
                                           false, false)) {
                            return false;
                        }
                        m_vars[i].offset = !ptr_data ? 0 :
                            (char*) base -
                            (char*) ptr_data->cpu_addr.start();
                    }

                    // save pointer data
                    m_vars_extra[i].src_data = ptr_data;
2302 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars_extra
[i
].type_src
);
2305 if (m_vars_extra
[i
].type_src
== c_data_ptr_array
) {
2309 if (src_is_for_mic
&& m_vars
[i
].flags
.is_stack_buf
) {
2310 if (this_threads_cpu_stack_addr
== 0) {
2311 this_threads_cpu_stack_addr
=
2312 get_this_threads_cpu_stack_addr(
2313 stack_addr
, entry_id
, thread_specific_function_locals
);
2315 m_vars
[i
].offset
= static_cast<char*>
2317 this_threads_cpu_stack_addr
;
2319 // if source is used at CPU save its offset and disp
2320 if (m_vars
[i
].into
== NULL
|| m_vars
[i
].direction
.in
) {
2321 m_vars_extra
[i
].cpu_offset
= m_vars
[i
].offset
;
2322 m_vars_extra
[i
].cpu_disp
= m_vars
[i
].disp
;
2325 // If "into" is define we need to do the similar work for it
2326 if (!m_vars
[i
].into
) {
2330 int64_t into_disp
=0, into_offset
= 0;
2332 switch (m_vars_extra
[i
].type_dst
) {
2333 case c_data_ptr_array
:
2337 case c_void_ptr_ptr
:
2339 int64_t size
= m_vars
[i
].size
;
2341 if (m_vars
[i
].flags
.is_non_cont_struct
&& src_is_for_mic
) {
2342 NonContigDesc
*desc
=
2343 static_cast<NonContigDesc
*>(m_vars
[i
].into
);
2344 noncont_struct_dump("", "INTO DATA", desc
);
2345 m_vars_extra
[i
].noncont_desc
= desc
;
2346 m_vars
[i
].into
= reinterpret_cast<void*>(desc
->base
);
2347 size
= get_noncont_struct_size(desc
);
2350 else if (m_vars_extra
[i
].type_dst
== c_cean_var
) {
2352 const Arr_Desc
*ap
=
2353 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2356 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
2358 // offset and length are derived from the array descriptor
2359 __arr_data_offset_and_length(ap
, into_disp
, size
);
2361 if (!is_arr_desc_contiguous(ap
)) {
2362 m_vars
[i
].flags
.is_noncont_dst
= 1;
2363 m_vars_extra
[i
].read_rng_dst
=
2364 init_read_ranges_arr_desc(ap
);
2365 if (!cean_ranges_match(
2366 m_vars_extra
[i
].read_rng_src
,
2367 m_vars_extra
[i
].read_rng_dst
)) {
2368 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2372 m_vars
[i
].into
= reinterpret_cast<void*>(ap
->base
);
2375 int64_t size_src
= m_vars_extra
[i
].read_rng_src
&&
2376 !m_vars
[i
].flags
.is_non_cont_struct
?
2377 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2379 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
2380 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2382 // It's supposed that "into" size must be not less
2384 if (size_src
> size_dst
) {
2385 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2386 size_src
, size_dst
);
2390 if (m_vars
[i
].direction
.bits
) {
2391 if (m_vars
[i
].flags
.is_static_dstn
) {
2394 // find data associated with variable
2395 if (!find_ptr_data(ptr_data
, m_vars
[i
].into
,
2396 into_disp
, size
, false, false)) {
2399 if (ptr_data
!= 0) {
2400 // offset to base from the beginning of the buffer
2403 (char*) m_vars
[i
].into
-
2404 (char*) ptr_data
->cpu_addr
.start();
2407 m_vars
[i
].flags
.is_static_dstn
= false;
2409 m_vars_extra
[i
].dst_data
= ptr_data
;
2413 if (m_vars
[i
].direction
.in
&&
2414 !m_vars
[i
].flags
.is_static_dstn
) {
2415 m_in_datalen
+= m_vars
[i
].size
;
2417 // for non-static target destination defined as CEAN
2418 // expression we pass to target its size and dist
2419 if (m_vars_extra
[i
].type_dst
== c_cean_var
) {
2420 m_in_datalen
+= 2 * sizeof(uint64_t);
2422 m_need_runfunction
= true;
2425 if (m_is_openmp
&& src_is_for_mic
) {
2426 if (m_vars
[i
].flags
.is_static_dstn
) {
2427 // Static data is transferred either by omp target
2428 // update construct which passes zeros for
2429 // alloc_if and free_if or by always modifier.
2430 if (!m_vars
[i
].flags
.always_copy
&&
2431 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
)) {
2432 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
2436 AutoData
*auto_data
;
2437 if (m_vars
[i
].alloc_if
) {
2438 auto_data
= m_device
.insert_auto_data(
2439 m_vars
[i
].into
, size_dst
);
2440 auto_data
->add_reference();
2443 // TODO: what should be done if var is not in
2445 auto_data
= m_device
.find_auto_data(
2449 // For automatic variables data is transferred:
2450 // - if always modifier is used OR
2451 // - if alloc_if == 0 && free_if == 0 OR
2452 // - if reference count is 1
2453 if (!m_vars
[i
].flags
.always_copy
&&
2454 (m_vars
[i
].alloc_if
|| m_vars
[i
].free_if
) &&
2456 auto_data
->get_reference() != 1)) {
2457 m_vars
[i
].direction
.bits
= c_parameter_nocopy
;
2459 // save data for later use
2460 m_vars_extra
[i
].auto_data
= auto_data
;
2467 if (m_vars
[i
].direction
.bits
||
2468 m_vars
[i
].alloc_if
||
2469 m_vars
[i
].free_if
) {
2470 ArrDesc
*dvp
= static_cast<ArrDesc
*>(m_vars
[i
].into
);
2473 __dv_desc_dump("INTO", dvp
);
2475 // send dope vector contents excluding base
2476 m_in_datalen
+= m_vars
[i
].size
- sizeof(uint64_t);
2477 m_need_runfunction
= true;
2483 case c_string_ptr_ptr
:
2484 case c_data_ptr_ptr
:
2485 case c_cean_var_ptr
:
2486 case c_cean_var_ptr_ptr
:
2488 int64_t size
= m_vars
[i
].size
;
2490 if (m_vars_extra
[i
].type_dst
== c_cean_var_ptr
||
2491 m_vars_extra
[i
].type_dst
== c_cean_var_ptr_ptr
) {
2493 const Arr_Desc
*ap
=
2494 static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2497 ARRAY_DESC_DUMP(" ", "INTO", ap
, 1, src_is_for_mic
);
2499 // offset and length are derived from the array descriptor
2500 __arr_data_offset_and_length(ap
, into_disp
, size
);
2502 if (!is_arr_desc_contiguous(ap
)) {
2503 m_vars
[i
].flags
.is_noncont_src
= 1;
2504 m_vars_extra
[i
].read_rng_dst
=
2505 init_read_ranges_arr_desc(ap
);
2506 if (!cean_ranges_match(
2507 m_vars_extra
[i
].read_rng_src
,
2508 m_vars_extra
[i
].read_rng_dst
)) {
2509 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2512 m_vars
[i
].into
= reinterpret_cast<char**>(ap
->base
);
2514 else if (m_vars_extra
[i
].type_dst
== c_dv_ptr
) {
2515 // need to send DV to the device unless it is 'nocopy'
2516 if (m_vars
[i
].direction
.bits
||
2517 m_vars
[i
].alloc_if
||
2518 m_vars
[i
].free_if
) {
2519 ArrDesc
*dvp
= *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2522 __dv_desc_dump("INTO", dvp
);
2524 m_vars
[i
].direction
.bits
= c_parameter_in
;
2528 int64_t size_src
= m_vars_extra
[i
].read_rng_src
&&
2529 !m_vars
[i
].flags
.is_non_cont_struct
?
2530 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2532 int64_t size_dst
= m_vars_extra
[i
].read_rng_dst
?
2533 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2535 // It's supposed that "into" size must be not less than
2537 if (size_src
> size_dst
) {
2538 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2539 size_src
, size_dst
);
2543 if (m_vars
[i
].direction
.bits
) {
2547 void *base
= *static_cast<void**>(m_vars
[i
].into
);
2549 if (m_vars
[i
].direction
.in
) {
2551 if (m_vars
[i
].flags
.is_stack_buf
) {
2552 // for stack persistent objects ptr data is created
2553 // by var_desc with number 0.
2554 // Its ptr_data is stored at m_stack_ptr_data
2555 ptr_data
= m_stack_ptr_data
;
2557 else if (m_vars
[i
].alloc_if
) {
2558 if (m_vars
[i
].flags
.preallocated
) {
2559 m_out_datalen
+= sizeof(void*);
2560 m_need_runfunction
= true;
2564 if (!alloc_ptr_data(
2566 reinterpret_cast<char *>(base
) + alloc_disp
,
2567 (alloc_base
!= NULL
) ?
2568 alloc_disp
: into_disp
,
2569 (alloc_base
!= NULL
) ?
2572 (alloc_base
!= NULL
) ?
2573 0 : m_vars
[i
].align
,
2574 m_vars
[i
].flags
.targetptr
,
2575 m_vars
[i
].flags
.preallocated
,
2576 m_vars
[i
].flags
.pin
)) {
2579 if (m_vars
[i
].flags
.targetptr
) {
2580 if (!init_mic_address(ptr_data
)) {
2583 *static_cast<void**>(m_vars
[i
].into
) = base
=
2584 reinterpret_cast<void*>(ptr_data
->mic_addr
);
2586 if (ptr_data
->add_reference() == 0 &&
2587 ptr_data
->mic_buf
!= 0) {
2588 // add buffer to the list of buffers that
2589 // are passed to dispatch call
2590 m_compute_buffers
.push_back(
2594 // will send buffer address to device
2595 m_vars
[i
].flags
.sink_addr
= 1;
2598 if (!ptr_data
->is_static
) {
2599 // need to add reference for buffer
2600 m_need_runfunction
= true;
2604 // use existing association from pointer table
2605 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2606 size
, m_vars
[i
].flags
.targetptr
, true)) {
2609 m_vars
[i
].flags
.sink_addr
= 1;
2612 if (ptr_data
->alloc_disp
!= 0) {
2613 m_vars
[i
].flags
.alloc_disp
= 1;
2614 m_in_datalen
+= sizeof(alloc_disp
);
2617 if (m_vars
[i
].flags
.sink_addr
) {
2618 // get buffers's address on the sink
2619 if (!init_mic_address(ptr_data
)) {
2623 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2626 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2627 // need to decrement buffer reference on target
2628 m_need_runfunction
= true;
2631 // copy other pointer properties to var descriptor
2632 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2633 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2636 if (!find_ptr_data(ptr_data
,
2645 into_offset
= ptr_data
?
2647 (char*) ptr_data
->cpu_addr
.start() :
2652 // for FROM transfer of stack buffer's variable
2653 if (src_is_for_mic
&& m_vars
[i
].flags
.is_stack_buf
) {
2654 AutoData
*auto_data
;
2655 char *base
= *static_cast<char**>(m_vars
[i
].into
);
2656 if (m_vars
[i
].alloc_if
) {
2657 auto_data
=m_device
.insert_auto_data(
2660 auto_data
->add_reference();
2663 auto_data
= m_device
.find_auto_data(
2666 // save data for later use
2667 m_vars_extra
[i
].auto_data
= auto_data
;
2668 // For automatic variables
2669 // data is transferred:
2670 // - if always modifier is used OR
2671 // - if alloc_if == 0 && free_if == 0 OR
2672 // - if reference count is 1
2673 if (!m_vars
[i
].flags
.always_copy
&&
2674 (m_vars
[i
].alloc_if
||
2675 m_vars
[i
].free_if
) &&
2677 auto_data
->get_reference() != 1) {
2678 m_vars
[i
].direction
.bits
=
2683 // save pointer data
2684 m_vars_extra
[i
].dst_data
= ptr_data
;
2690 case c_func_ptr_ptr
:
2695 case c_dv_data_slice
:
2696 case c_dv_ptr_data_slice
:
2697 if (m_vars
[i
].direction
.bits
||
2698 m_vars
[i
].alloc_if
||
2699 m_vars
[i
].free_if
) {
2706 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra
[i
].type_dst
)) {
2707 ap
= static_cast<const Arr_Desc
*>(m_vars
[i
].into
);
2710 ARRAY_DESC_DUMP(" ", "INTO", ap
, 0, src_is_for_mic
);
2712 dvp
= (m_vars_extra
[i
].type_dst
== c_dv_data_slice
) ?
2713 reinterpret_cast<ArrDesc
*>(ap
->base
) :
2714 *reinterpret_cast<ArrDesc
**>(ap
->base
);
2717 dvp
= (m_vars_extra
[i
].type_dst
== c_dv_data
) ?
2718 static_cast<ArrDesc
*>(m_vars
[i
].into
) :
2719 *static_cast<ArrDesc
**>(m_vars
[i
].into
);
2721 if (!__dv_is_contiguous(dvp
)) {
2722 m_vars
[i
].flags
.is_noncont_dst
= 1;
2723 m_vars_extra
[i
].read_rng_dst
=
2724 init_read_ranges_dv(dvp
);
2726 // size and displacement
2727 if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra
[i
].type_dst
)) {
2728 // offset and length are derived from the array
2730 __arr_data_offset_and_length(ap
, into_disp
, size
);
2731 if (m_vars
[i
].direction
.bits
) {
2732 if (!is_arr_desc_contiguous(ap
)) {
2733 if (m_vars
[i
].flags
.is_noncont_dst
) {
2734 LIBOFFLOAD_ERROR(c_slice_of_noncont_array
);
2737 m_vars
[i
].flags
.is_noncont_dst
= 1;
2738 m_vars_extra
[i
].read_rng_dst
=
2739 init_read_ranges_arr_desc(ap
);
2740 if (!cean_ranges_match(
2741 m_vars_extra
[i
].read_rng_src
,
2742 m_vars_extra
[i
].read_rng_dst
)) {
2743 LIBOFFLOAD_ERROR(c_ranges_dont_match
);
2749 if (m_vars
[i
].flags
.has_length
) {
2750 size
= __dv_data_length(dvp
, m_vars
[i
].count
);
2753 size
= __dv_data_length(dvp
);
2759 m_vars_extra
[i
].read_rng_src
&&
2760 (!m_vars
[i
].flags
.is_non_cont_struct
||
2762 cean_get_transf_size(m_vars_extra
[i
].read_rng_src
) :
2765 m_vars_extra
[i
].read_rng_dst
?
2766 cean_get_transf_size(m_vars_extra
[i
].read_rng_dst
) :
2768 // It's supposed that "into" size must be not less
2770 if (size_src
> size_dst
) {
2771 LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes
,
2772 size_src
, size_dst
);
2777 void *base
= reinterpret_cast<void*>(dvp
->Base
);
2780 if (m_vars
[i
].direction
.in
) {
2781 if (m_vars
[i
].alloc_if
) {
2783 if (!alloc_ptr_data(
2785 reinterpret_cast<char *>(base
) + alloc_disp
,
2786 (alloc_base
!= NULL
) ?
2787 alloc_disp
: into_disp
,
2788 (alloc_base
!= NULL
) ?
2791 (alloc_base
!= NULL
) ?
2792 0 : m_vars
[i
].align
,
2793 m_vars
[i
].flags
.targetptr
,
2794 m_vars
[i
].flags
.preallocated
,
2795 m_vars
[i
].flags
.pin
)) {
2798 if (ptr_data
->add_reference() == 0 &&
2799 ptr_data
->mic_buf
!=0) {
2800 // add buffer to the list of buffers
2801 // that are passed to dispatch call
2802 m_compute_buffers
.push_back(
2806 // will send buffer address to device
2807 m_vars
[i
].flags
.sink_addr
= 1;
2810 if (!ptr_data
->is_static
) {
2811 // need to add reference for buffer
2812 m_need_runfunction
= true;
2816 // use existing association from pointer table
2817 if (!find_ptr_data(ptr_data
, base
, into_disp
,
2818 size
, m_vars
[i
].flags
.targetptr
, true)) {
2822 // need to update base in dope vector on device
2823 m_vars
[i
].flags
.sink_addr
= 1;
2826 if (ptr_data
->alloc_disp
!= 0) {
2827 m_vars
[i
].flags
.alloc_disp
= 1;
2828 m_in_datalen
+= sizeof(alloc_disp
);
2831 if (m_vars
[i
].flags
.sink_addr
) {
2832 // get buffers's address on the sink
2833 if (!init_mic_address(ptr_data
)) {
2836 m_in_datalen
+= sizeof(ptr_data
->mic_addr
);
2839 if (!ptr_data
->is_static
&& m_vars
[i
].free_if
) {
2840 // need to decrement buffer reference on target
2841 m_need_runfunction
= true;
2844 // offset to base from the beginning of the buffer
2847 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2849 // copy other pointer properties to var descriptor
2850 m_vars
[i
].mic_offset
= ptr_data
->mic_offset
;
2851 m_vars
[i
].flags
.is_static_dstn
= ptr_data
->is_static
;
2853 else { // src_is_for_mic
2854 if (!find_ptr_data(ptr_data
,
2861 into_offset
= !ptr_data
?
2863 (char*) base
- (char*) ptr_data
->cpu_addr
.start();
2866 // save pointer data
2867 m_vars_extra
[i
].dst_data
= ptr_data
;
2872 LIBOFFLOAD_ERROR(c_unknown_var_type
, m_vars_extra
[i
].type_src
);
2875 // if into is used at CPU save its offset and disp
2876 if (m_vars
[i
].direction
.out
) {
2877 m_vars_extra
[i
].cpu_offset
= into_offset
;
2878 m_vars_extra
[i
].cpu_disp
= into_disp
;
2881 if (m_vars
[i
].flags
.is_stack_buf
) {
2882 if (this_threads_cpu_stack_addr
== 0) {
2883 this_threads_cpu_stack_addr
=
2884 get_this_threads_cpu_stack_addr(
2885 stack_addr
, entry_id
,
2886 thread_specific_function_locals
);
2888 into_offset
= static_cast<char*>
2890 this_threads_cpu_stack_addr
;
2892 m_vars
[i
].offset
= into_offset
;
2893 m_vars
[i
].disp
= into_disp
;
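// Illustrative note on the offset/disp bookkeeping above (addresses are
// assumed, not taken from any real run): later transfer calls combine the
// two values into an absolute position inside a COI buffer.
//
//     char *start = (char*) 0x1000;   // ptr_data->cpu_addr.start()
//     char *var   = (char*) 0x1040;   // variable base address
//     int64_t offset = var - start;   // 0x40, stored in m_vars[i].offset
//     int64_t disp   = 16;            // stored in m_vars[i].disp
//     // transfer position within the buffer = offset + disp = 0x50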
bool OffloadDescriptor::setup_misc_data(const char *name)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);

    // we can skip run function call together with wait if offloaded
    // region is empty and there is no user defined non-pointer IN/OUT data
    if (m_need_runfunction) {
        // variable descriptors are sent as input data
        m_in_datalen += m_vars_total * sizeof(VarDesc);

        // timer data is sent as a part of the output data
        m_out_datalen += OFFLOAD_TIMER_DATALEN();

        // max from input data and output data length
        uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
                            m_out_datalen;

        // Misc data has the following layout
        //     <Function Descriptor>
        //     <In/Out Data>        (optional)
        //
        // We can transfer copyin/copyout data in misc/return data which can
        // be passed to run function call if its size does not exceed
        // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise we have to allocate
        // a buffer for it.
        m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
        m_func_desc_size = (m_func_desc_size + 7) & ~7;

        int misc_data_offset = 0;
        int misc_data_size = 0;
        if (m_func_desc_size +
            m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
            m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
            // use misc/return data for copyin/copyout
            misc_data_offset = m_func_desc_size;
            misc_data_size = data_len;
        }
        else {
            OffloadTimer timer_buf(get_timer_data(),
                                   c_offload_host_alloc_data_buffer);

            // send/receive data using buffer
            COIRESULT res = COI::BufferCreate(data_len,
                                              // ...
                                              1, &m_device.get_process(),
                                              &m_inout_buf);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_create, res);
            }

            m_compute_buffers.push_back(m_inout_buf);
            m_destroy_buffers.push_back(m_inout_buf);
        }

        // initialize function descriptor
        m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
                                                   misc_data_size);
        if (m_func_desc == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        m_func_desc->console_enabled = console_enabled;
        m_func_desc->timer_enabled = offload_report_enabled &&
            (timer_enabled || offload_report_level);
        m_func_desc->offload_report_level = offload_report_enabled ?
            offload_report_level : 0;
        m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
        m_func_desc->in_datalen = m_in_datalen;
        m_func_desc->out_datalen = m_out_datalen;
        m_func_desc->vars_num = m_vars_total;
        m_func_desc->data_offset = misc_data_offset;

        // append entry name
        strcpy(m_func_desc->data, name);
    }

    return true;
}
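// Example of the size arithmetic above (illustrative, the descriptor size
// is assumed): "(x + 7) & ~7" rounds the function-descriptor size up to the
// next multiple of 8. For an entry name of 20 characters:
//
//     size_t raw = sizeof(FunctionDescriptor) + 20 + 1;  // e.g. 40 + 21 = 61
//     size_t aligned = (raw + 7) & ~7;                   // 68 & ~7 = 64
//
// Copyin/copyout then rides in the misc/return data only when both
// m_func_desc_size + m_in_datalen and m_out_datalen fit under
// COI_PIPELINE_MAX_IN_MISC_DATA_LEN; otherwise a COI buffer is created.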
void OffloadDescriptor::setup_omp_async_info()
{
    OFFLOAD_TRACE(2, "setup_omp_async_info\n");
    OmpAsyncLastEventType event_type = m_need_runfunction ?
                                       c_last_runfunc : c_last_write;
    int last_in = m_need_runfunction ? 0 : -1;
    int i;

    for (i = m_vars_total - 1; i >= 0; i--) {
        bool src_is_target = (m_vars[i].direction.out || !m_vars[i].into);
        int var_type = src_is_target ? m_vars_extra[i].type_src :
                                       m_vars_extra[i].type_dst;
        bool target_is_static = src_is_target ? m_vars[i].flags.is_static :
                                                m_vars[i].flags.is_static_dstn;
        switch (var_type) {
            // ...
                if (m_vars[i].direction.out && target_is_static) {
                    event_type = c_last_read;
                }
                else if (last_in < 0 && m_vars[i].direction.in &&
                         // ...
                         target_is_static) {
                    last_in = i;
                }
                break;
            // ...
            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            // ...
            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
                if (m_vars[i].direction.out) {
                    event_type = c_last_read;
                }
                else if (last_in < 0 && m_vars[i].direction.in) {
                    last_in = i;
                }
                break;
            default:
                break;
        }
        if (event_type == c_last_read) {
            break;
        }
    }

    if (event_type == c_last_read) {
        m_vars_extra[i].omp_last_event_type = c_last_read;
    }
    else if (event_type == c_last_write) {
        m_vars_extra[last_in].omp_last_event_type = c_last_write;
    }
    m_omp_async_last_event_type = event_type;
    OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
                  m_omp_async_last_event_type);
}
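// Sketch of the scan above (illustrative summary): variables are walked
// from last to first, and the "last event" is classified as the latest
// action the offload will perform. A read back to the host (c_last_read)
// wins over a write to the target (c_last_write), and c_last_write itself
// only stands when no run function is queued (otherwise c_last_runfunc).
// The omp_last_event_type recorded in m_vars_extra marks which transfer
// will carry the OpenMP completion callback.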
void offload_proxy_task_completed_ooo(
    COIEVENT,           // event (unused)
    const COIRESULT,    // result (unused)
    const void *info
)
{
    task_completion_callback((void *) info);
}

// Callback function for asynchronous offloads
void offload_complete_task(
    COIEVENT,           // event (unused)
    const COIRESULT,    // result (unused)
    const void *info
)
{
    Stream *stream;
    OffloadDescriptor *task = const_cast<OffloadDescriptor*>(
        reinterpret_cast<const OffloadDescriptor*>(info));
    uint32_t events_remained;

    lock_complete.lock();
    if (!offload_descr_map[task]) {
        lock_complete.unlock();
        return;
    }

#ifndef TARGET_WINNT
    events_remained = __sync_sub_and_fetch(&task->m_event_count, 1);
#else // TARGET_WINNT
    events_remained = _InterlockedDecrement(&task->m_event_count);
#endif // TARGET_WINNT
    // Waiting for the last event
    if (events_remained != 0) {
        lock_complete.unlock();
        return;
    }

    // Callback could be called when execution at host is completed.
    // Do nothing as engine data is destructed
    if (!task->get_device().get_ready()) {
        lock_complete.unlock();
        return;
    }

    void *signal = task->get_signal();
    _Offload_stream stream_handle = task->get_stream();

    OFFLOAD_TRACE(2, "Call function offload_complete_task(%p)\n", info);

    // Completed offload has a signal
    if (task->m_has_signal) {
        if (!offload_descr_map[task]) {
            lock_complete.unlock();
            return;
        }
        task->get_device().complete_signaled_ofld(signal);
        // Asynchronous offload can have both signal and stream. Need to
        // clean stream if any.
        stream_handle = task->get_stream();
        if (stream_handle != -1) {
            stream = Stream::find_stream(stream_handle, false);
            if (stream && stream->get_last_offload() == task) {
                stream->set_last_offload(NULL);
            }
        }
        offload_descr_map[task] = false;
        lock_complete.unlock();

        if (task->offload_finish(0)) { // arg is 0 for is_traceback
            task->cleanup();
        }
        // ...
    }
    // Asynchronous by stream
    else {
        if (stream_handle != 0) {
            stream = Stream::find_stream(stream_handle, false);

            // the stream was not created or was destroyed
            if (!stream) {
                LIBOFFLOAD_ERROR(c_offload_no_stream,
                                 task->get_device().get_logical_index());
                // ...
            }
            if (!offload_descr_map[task]) {
                lock_complete.unlock();
                return;
            }
            if (task == stream->get_last_offload()) {
                stream->set_last_offload(NULL);
            }
            // if the offload has both signal and stream we will complete
            // it as it has the signal. So we don't need to mark signal
            // as completed.
            offload_descr_map[task] = false;
            lock_complete.unlock();
            if (task->offload_finish(0)) { // arg is 0 for is_traceback
                task->cleanup();
            }
        }
    }
}
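// Illustrative sketch of the last-event gate above, assuming three
// registered events: each callback atomically decrements m_event_count and
// only the invocation that observes zero proceeds to finish the offload.
//
//     // m_event_count == 3; three callbacks race:
//     //   callback A: __sync_sub_and_fetch(&count, 1) -> 2, returns early
//     //   callback B: __sync_sub_and_fetch(&count, 1) -> 1, returns early
//     //   callback C: __sync_sub_and_fetch(&count, 1) -> 0, runs
//     //               offload_finish(0) and cleanup under lock_complete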
void OffloadDescriptor::register_omp_event_call_back(
    const COIEVENT *event,
    const void *info
)
{
    // ...
    register_event_call_back(&offload_proxy_task_completed_ooo, event, info);
}

void OffloadDescriptor::register_event_call_back(
    void (*func)(COIEVENT, const COIRESULT, const void*),
    const COIEVENT *event,
    const void *info
)
{
    OFFLOAD_TRACE(2, "register_event_call_back(event=%p, info=%p)\n",
                  event, info);
    if (COI::EventRegisterCallback) {
        COI::EventRegisterCallback(
            *event, func, info, 0);
        OFFLOAD_TRACE(2,
            "COI::EventRegisterCallback found; callback registered\n");
    }
}
bool OffloadDescriptor::wait_dependencies(
    const void **waits,
    int num_waits,
    _Offload_stream handle
)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
    bool ret = true;
    OffloadDescriptor *task;
    void *signal;
    Stream *stream;

    if (num_waits == 0) {
        // Prepare in dependencies for stream
        get_stream_in_dependencies(m_num_in_dependencies, m_p_in_dependencies);
        return true;
    }

    // wait for offloads started in asynchronous streams
    if (num_waits == -1) {
        // some specific stream of the device
        if (handle != 0) {
            lock_complete.lock();
            stream = Stream::find_stream(handle, false);

            // the stream was not created or was destroyed
            if (!stream) {
                LIBOFFLOAD_ERROR(c_offload_no_stream,
                                 m_device.get_logical_index());
                // ...
            }
            task = stream->get_last_offload();

            // offload was completed by previous offload_wait pragma
            // or wait clause
            if (!offload_descr_map[task]) {
                lock_complete.unlock();
                return true;
            }
            stream->set_last_offload(NULL);
            if (task->m_has_signal) {
                signal = task->get_signal();
                if (m_device.find_signal(signal, false) == task) {
                    m_device.complete_signaled_ofld(signal);
                }
            }
            offload_descr_map[task] = false;
            lock_complete.unlock();

            if (!task->offload_finish(0)) { // arg is 0 for is_traceback
                ret = false;
            }
            task->cleanup();
        }
        // all streams of the device or over all devices
        else {
            StreamMap stream_map = Stream::all_streams;
            for (StreamMap::iterator it = stream_map.begin();
                 it != stream_map.end(); it++) {
                Stream * stream = it->second;
                if (!m_wait_all_devices &&
                    stream->get_device() != m_device.get_logical_index()) {
                    continue;
                }
                lock_complete.lock();

                // get associated async task
                OffloadDescriptor *task = stream->get_last_offload();
                // offload was completed by offload_wait pragma or wait clause
                if (!offload_descr_map[task]) {
                    lock_complete.unlock();
                    continue;
                }
                if (task->m_has_signal) {
                    signal = task->get_signal();
                    if (task->get_device().find_signal(signal, false) ==
                        task) {
                        task->get_device().complete_signaled_ofld(signal);
                    }
                }
                stream->set_last_offload(NULL);
                offload_descr_map[task] = false;
                lock_complete.unlock();
                if (!task->offload_finish(0)) { // arg is 0 for is_traceback
                    ret = false;
                }
                task->cleanup();
            }
            // no uncompleted streams
            // ...
        }
        return ret;
    }

    // If offload is asynchronous we will not really wait for signals.
    // We will collect all waited events into m_p_in_dependencies vector
    // to be used in future calls to COI::Copy... API.
    if (!__offload_always_wait && (m_has_signal || (get_stream() > 0))) {
        uint64_t num_in_dep = 0,
                 num_in_dep_prev = 0;
        COIEVENT *p_in_dep = NULL;
        _Offload_stream stream_handle = get_stream();
        bool stream_need_connection = stream_handle > 0;

        if (stream_need_connection) {
            stream = Stream::find_stream(stream_handle, false);
            // check previous offload with the stream_handle
            // to be noncompleted
            if (!stream || !stream->get_last_offload()) {
                stream_need_connection = false;
            }
        }
        for (int i = 0; i < num_waits; i++) {
            task = m_device.find_signal(waits[i], false);
            if (task == 0) {
                LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
                                 waits[i]);
                // ...
            }
            else if (task == SIGNAL_HAS_COMPLETED) {
                continue;
            }
            if (stream_need_connection &&
                stream->get_last_offload() == task) {
                stream_need_connection = false;
            }
            if (!task->m_num_in_dependencies) {
                continue;
            }
            num_in_dep += task->m_num_in_dependencies;
            p_in_dep = (COIEVENT*)realloc(p_in_dep,
                                          sizeof(COIEVENT) * num_in_dep);
            if (p_in_dep == NULL)
                LIBOFFLOAD_ERROR(c_malloc);
            memcpy(p_in_dep + num_in_dep_prev, task->m_p_in_dependencies,
                   task->m_num_in_dependencies * sizeof(COIEVENT));
            num_in_dep_prev = num_in_dep;
        }
        if (stream_need_connection) {
            task = stream->get_last_offload();
            // ...
            num_in_dep += task->m_num_in_dependencies;
            p_in_dep = (COIEVENT*)realloc(p_in_dep,
                                          sizeof(COIEVENT) * num_in_dep);
            if (p_in_dep == NULL)
                LIBOFFLOAD_ERROR(c_malloc);
            memcpy(p_in_dep + num_in_dep_prev,
                   task->m_p_in_dependencies,
                   task->m_num_in_dependencies * sizeof(COIEVENT));
            num_in_dep_prev = num_in_dep;
        }
        m_num_in_dependencies = num_in_dep ? num_in_dep :
                                             m_num_in_dependencies;
        m_p_in_dependencies = num_in_dep ? p_in_dep : m_p_in_dependencies;
    }
    // wait and do offload_finish for serial offload
    else {
        for (int i = 0; i < num_waits; i++) {
            _Offload_stream stream_handle;

            lock_complete.lock();
            task = m_device.find_signal(waits[i], false);
            if (task == 0) {
                LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
                                 waits[i]);
                // ...
            }
            else if (!offload_descr_map[task]) {
                lock_complete.unlock();
                continue;
            }
            // Need to mark signal as completed to prevent a race condition
            // with the call to "offload_complete_task" for the same
            // signal.
            m_device.complete_signaled_ofld(waits[i]);

            // Asynchronous offload can have both signal and stream.
            // Need to clean stream if any.
            stream_handle = task->m_stream;
            if (stream_handle != -1) {
                stream = Stream::find_stream(stream_handle, false);
                if (stream && stream->get_last_offload() == task) {
                    stream->set_last_offload(NULL);
                }
            }
            offload_descr_map[task] = false;
            lock_complete.unlock();

            if (!task->offload_finish(0)) { // arg is 0 for is_traceback
                ret = false;
            }
            task->cleanup();
        }
    }
    return ret;
}
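// Illustrative example of the dependency merge above (hypothetical sizes):
// two uncompleted tasks with 2 and 3 in-dependencies are folded into one
// array that later COI::Copy*/run-function calls take as input.
//
//     // task1->m_num_in_dependencies == 2, task2->... == 3
//     // iteration 1: num_in_dep = 2; realloc to 2; memcpy at offset 0
//     // iteration 2: num_in_dep = 5; realloc to 5; memcpy at offset 2
//     // result: m_num_in_dependencies = 5, m_p_in_dependencies = p_in_dep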
bool OffloadDescriptor::offload_wrap(
    const char *name,
    bool is_empty,
    VarDesc *vars,
    VarDesc2 *vars2,
    int vars_total,
    const void **waits,
    int num_waits,
    const void **signal,
    int entry_id,
    const void *stack_addr,
    OffloadFlags offload_flags
)
{
    OffloadWaitKind wait_kind = c_offload_wait_signal;
    bool is_traceback = offload_flags.bits.fortran_traceback;

    // define kind of wait if any;
    // there can be one of the following kind:
    // 1. c_offload_wait_signal for "offload_wait wait(signal)"
    // 2. c_offload_wait_stream for "offload_wait stream(stream)"
    // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
    if (num_waits == -1) {
        wait_kind = (m_stream == 0) ?
                    c_offload_wait_all_streams :
                    c_offload_wait_stream;
    }
    char buf[35];
    const char *stream_str;

    if (m_stream == no_stream || num_waits == -1) {
        stream_str = "none";
    }
    else if (m_stream == 0) {
        stream_str = "all";
    }
    else {
        sprintf(buf, "%#llx", m_stream);
        stream_str = buf;
    }

    if (signal == 0) {
        OFFLOAD_DEBUG_TRACE_1(1,
                      GET_OFFLOAD_NUMBER(get_timer_data()),
                      c_offload_init_func,
                      "Offload function %s, is_empty=%d, #varDescs=%d, "
                      "signal=none, stream=%s, #waits=%d%c",
                      name, is_empty, vars_total, stream_str, num_waits,
                      num_waits == 0 ? '\n' : ' ');
        // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
        // since the number of waits is not fixed.
        if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
            // ...
            if (m_stream == no_stream) {
                printf("%p", waits[0]);
                for (int i = 1; i < num_waits; i++) {
                    printf(", %p", waits[i]);
                }
            }
            else if (m_stream != 0) {
                printf("%#x", m_stream);
            }
            else {
                printf(" all streams");
            }
            printf("\n");
        }
        // stream in wait is reported further in OFFLOAD_REPORT for waits
        if (m_stream != no_stream && num_waits == 0) {
            OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
                           // ...
                           );
        }
        else {
            OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
                           // ...
                           );
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE_1(1,
                      GET_OFFLOAD_NUMBER(get_timer_data()),
                      c_offload_init_func,
                      "Offload function %s, is_empty=%d, #varDescs=%d, "
                      "signal=%p, stream=%s, #waits=%d%c",
                      name, is_empty, vars_total, signal, stream_str,
                      num_waits, num_waits == 0 ? '\n' : ' ');
        // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
        // since the number of waits is not fixed.
        if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
            // ...
            if (m_stream == no_stream) {
                printf("%p", waits[0]);
                for (int i = 1; i < num_waits; i++) {
                    printf(", %p", waits[i]);
                }
            }
            else if (m_stream != 0) {
                printf("%#x", m_stream);
            }
            else {
                printf(" all streams");
            }
            printf("\n");
        }
        // stream in wait is reported further in OFFLOAD_REPORT for waits
        if (m_stream != no_stream && num_waits == 0) {
            OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
                           // ...
                           );
        }
        else {
            OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
                           // ...
                           );
        }
    }
    if (console_enabled >= 1 && offload_flags.flags != 0) {
        trace_offload_flags(get_timer_data(), offload_flags);
    }

    OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
                   c_offload_wait, "%d\n",
                   wait_kind, num_waits,
                   (wait_kind == c_offload_wait_signal) ?
                   waits :
                   reinterpret_cast<const void **>(m_stream));

    if (m_status != 0) {
        m_status->result = OFFLOAD_SUCCESS;
        m_status->device_number = m_device.get_logical_index();
    }

    m_initial_need_runfunction = m_need_runfunction = !is_empty;

    // wait for dependencies to finish or set
    // m_num_in_dependencies and m_p_in_dependencies for asynchronous offload
    if (!wait_dependencies(waits, num_waits, m_stream)) {
        cleanup();
        return false;
    }

    // setup variable descriptors
    if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
        cleanup();
        return false;
    }

    if (offload_flags.bits.omp_async) {
        setup_omp_async_info();
    }

    // initiate send for pointers. Want to do it as early as possible.
    if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
                           // ...
                           )) {
        cleanup();
        return false;
    }

    // setup misc data for run function
    if (!setup_misc_data(name)) {
        cleanup();
        return false;
    }

    // gather copyin data into buffer
    if (!gather_copyin_data()) {
        cleanup();
        return false;
    }

    // Start the computation
    if (!compute(signal)) {
        cleanup();
        return false;
    }

    // initiate receive for pointers
    if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
                              // ...
                              )) {
        cleanup();
        return false;
    }
    if (offload_flags.bits.omp_async) {
        return true;
    }

    // if there is a signal or stream save the descriptor for later use.
    // num_waits == -1 is for offload_wait and there is nothing to save
    if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
        if (signal != 0) {
            m_device.add_signal(*signal, this);
        }
        if (m_stream != no_stream && m_stream != 0) {
            Stream * stream = Stream::find_stream(m_stream, false);
            if (stream) {
                stream->set_last_offload(this);
            }
            else {
                LIBOFFLOAD_ERROR(c_offload_no_stream,
                                 m_device.get_logical_index());
                // ...
            }
        }
        // Register callback function "offload_complete_task" for all out
        // events or for all in events if there are no out transfers
        if (!m_preallocated_alloc) {
            m_event_count = m_out_deps_total ?
                            m_out_deps_total : m_in_deps_total;
            COIEVENT *event_list = m_out_deps_total ? m_out_deps : m_in_deps;

            for (int i = 0; i < m_event_count; i++) {
                register_event_call_back(&offload_complete_task,
                                         &event_list[i], this);
            }
            offload_descr_map[this] = true;
            return true;
        }
    }

    // wait for the offload to finish.
    if (!offload_finish(is_traceback)) {
        cleanup();
        return false;
    }

    cleanup();
    return true;
}
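// Pipeline order enforced by offload_wrap (summary of the calls above):
// wait_dependencies -> setup_descriptors -> [setup_omp_async_info] ->
// send_pointer_data -> setup_misc_data -> gather_copyin_data -> compute ->
// receive_pointer_data -> (register completion callbacks or offload_finish).
// Pointer sends are issued before misc-data setup on purpose, so that large
// host-to-target copies start as early as possible.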
bool OffloadDescriptor::offload(
    const char *name,
    bool is_empty,
    VarDesc *vars,
    VarDesc2 *vars2,
    int vars_total,
    const void **waits,
    int num_waits,
    const void **signal,
    int entry_id,
    const void *stack_addr,
    OffloadFlags offload_flags
)
{
    bool res;
    res = offload_wrap(name, is_empty, vars, vars2, vars_total,
                       waits, num_waits, signal, entry_id,
                       stack_addr, offload_flags);
    if (res == false && !m_traceback_called) {
        if (offload_flags.bits.fortran_traceback) {
            OFFLOAD_TRACE(3,
                "Calling Fortran library to continue traceback from MIC\n");
            FORTRAN_TRACE_BACK(m_status->result);
            m_traceback_called = true;
        }
    }
    return res;
}
bool OffloadDescriptor::offload_finish(
    bool is_traceback
)
{
    COIRESULT res;

    // wait for compute dependencies to become signaled
    if (m_in_deps_total > 0 &&
        (m_out_deps_total <= 0 || m_preallocated_alloc)) {
        OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);

        if (__offload_active_wait) {
            // keep CPU busy
            do {
                res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
            }
            while (res == COI_TIME_OUT_REACHED);
        }
        else {
            res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
        }

        if (res != COI_SUCCESS) {
            if (m_status != 0 && !m_traceback_called) {
                m_status->result = translate_coi_error(res);
                if (is_traceback) {
                    OFFLOAD_TRACE(3,
                        "Calling Fortran library to continue traceback from MIC\n");
                    FORTRAN_TRACE_BACK(m_status->result);
                    m_traceback_called = true;
                }
                return false;
            }
            if (is_traceback && !m_traceback_called) {
                OFFLOAD_TRACE(3,
                    "Calling Fortran library to continue traceback from MIC\n");
                FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
                // ...
            }
            report_coi_error(c_event_wait, res);
        }
    }

    // need to do scatter copyout data received from target after
    // completing in dependencies to get preallocated buffers.
    // If there are no preallocated buffers we will scatter_copyout_data
    // after completing out dependencies. In this case we don't need to wait
    // on in dependencies as they are already in the DAG.
    if (m_out_with_preallocated) {
        if (!scatter_copyout_data()) {
            return false;
        }
        if (!receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
            cleanup();
            return false;
        }
    }

    // wait for receive dependencies to become signaled
    if (m_out_deps_total > 0) {
        OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);

        if (__offload_active_wait) {
            // keep CPU busy
            do {
                res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
            }
            while (res == COI_TIME_OUT_REACHED);
        }
        else {
            res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
        }

        if (res != COI_SUCCESS) {
            if (m_status != 0 && !m_traceback_called) {
                m_status->result = translate_coi_error(res);
                if (is_traceback) {
                    OFFLOAD_TRACE(3,
                        "Calling Fortran library to continue traceback from MIC\n");
                    FORTRAN_TRACE_BACK(m_status->result);
                    m_traceback_called = true;
                }
                return false;
            }
            if (is_traceback && !m_traceback_called) {
                OFFLOAD_TRACE(3,
                    "Calling Fortran library to continue traceback from MIC\n");
                FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
                // ...
            }
            report_coi_error(c_event_wait, res);
        }
    }

    // scatter copyout data received from target
    if (!m_out_with_preallocated && !scatter_copyout_data()) {
        return false;
    }

    // destroy buffers
    {
        OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);

        for (BufferList::const_iterator it = m_destroy_buffers.begin();
             it != m_destroy_buffers.end(); it++) {
            res = COI::BufferDestroy(*it);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_destroy, res);
            }
        }
    }

    return true;
}
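// Illustrative contrast of the two wait modes used above: with
// __offload_active_wait the host spins on a zero-timeout poll, otherwise it
// blocks until the events signal.
//
//     // active (keeps a CPU core busy, lowest latency):
//     //     do {
//     //         res = COI::EventWait(n, deps, 0 /*timeout*/, 1, 0, 0);
//     //     } while (res == COI_TIME_OUT_REACHED);
//     // passive (yields the core):
//     //     res = COI::EventWait(n, deps, -1 /*infinite*/, 1, 0, 0);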
void OffloadDescriptor::cleanup()
{
    // release device in orsl
    ORSL::release(m_device.get_logical_index());

    OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);

    Offload_Report_Epilog(get_timer_data());
}
bool OffloadDescriptor::is_signaled()
{
    bool signaled = true;
    COIRESULT res;

    // check compute and receive dependencies
    if (m_out_deps_total > 0) {
        res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
        signaled = signaled && (res == COI_SUCCESS);
    }
    else if (m_in_deps_total > 0) {
        res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
        signaled = signaled && (res == COI_SUCCESS);
    }

    return signaled;
}
static Arr_Desc * make_arr_desc(
    void *ptr_val,
    int64_t extent_start_val,
    int64_t extent_elements_val,
    int64_t size
)
{
    Arr_Desc *res;

    res = (Arr_Desc*)malloc(sizeof(Arr_Desc));
    if (res == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    res->base = reinterpret_cast<int64_t>(ptr_val);
    res->rank = 1;
    res->dim[0].size = size;
    res->dim[0].lindex = 0;
    res->dim[0].lower = extent_start_val;
    res->dim[0].upper = extent_elements_val + extent_start_val - 1;
    res->dim[0].stride = 1;

    return res;
}
// Send pointer data if source or destination or both of them are
// noncontiguous. The length of the destination is guaranteed to be
// sufficient for the transferred data.
bool OffloadDescriptor::send_noncontiguous_pointer_data(
    int i,
    PtrData* src_data,
    PtrData* dst_data,
    COIEVENT *event,
    uint64_t &data_sent,
    uint32_t in_deps_amount,
    COIEVENT *in_deps
)
{
    NonContigDesc *desc;
    int noncont_num = 0;
    int64_t offset_src, offset_dst;
    int64_t length_src, length_dst;
    int64_t length_src_cur, length_dst_cur;
    int64_t send_size;
    COIRESULT res;
    bool dst_is_empty = true;
    bool src_is_empty = true;

    data_sent = 0;

    // If BufferWriteMultiD is defined we can set values of required arguments
    // and transfer noncontiguous data via call to the COI routine.
    if (!m_vars[i].flags.is_non_cont_struct &&
        __offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
        struct Arr_Desc* arr_desc_dst;
        struct Arr_Desc* arr_desc_src;
        int64_t size_src, size_dst;
        char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
                                          m_vars_extra[i].type_src);
        COIBUFFER dst_buf = m_vars[i].into ?
                            m_vars_extra[i].dst_data->mic_buf :
                            m_vars_extra[i].src_data->mic_buf;

        offset_src = (m_vars_extra[i].read_rng_src) ?
                     m_vars_extra[i].read_rng_src->init_offset :
                     m_vars_extra[i].cpu_disp;
        size_src = m_vars_extra[i].read_rng_src ?
                   cean_get_transf_size(m_vars_extra[i].read_rng_src) :
                   m_vars[i].size;

        offset_dst = (m_vars_extra[i].read_rng_dst) ?
                     m_vars_extra[i].read_rng_dst->init_offset :
                     m_vars[i].disp;
        size_dst = m_vars_extra[i].read_rng_dst ?
                   cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
                   m_vars[i].size;

        int64_t el_size = (!m_vars[i].into ||
                           (m_vars_extra[i].read_rng_src &&
                            m_vars_extra[i].read_rng_dst)) ?
            1 :
            m_vars_extra[i].read_rng_src ?
                m_vars_extra[i].read_rng_src->arr_desc->dim[
                    m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
                m_vars_extra[i].read_rng_dst->arr_desc->dim[
                    m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;

        arr_desc_src = (m_vars_extra[i].read_rng_src) ?
                       m_vars_extra[i].read_rng_src->arr_desc :
                       make_arr_desc(NULL, // not required for source
                                     offset_src / el_size,
                                     size_src / el_size,
                                     el_size);

        arr_desc_dst = !m_vars[i].into ?
            arr_desc_src :
            (m_vars_extra[i].read_rng_dst) ?
                m_vars_extra[i].read_rng_dst->arr_desc :
                make_arr_desc(NULL,
                              offset_dst / el_size,
                              size_src / el_size,
                              el_size);

        int64_t alloc_disp = m_vars[i].into ?
                             m_vars_extra[i].dst_data->alloc_disp :
                             m_vars_extra[i].src_data->alloc_disp;

        arr_desc_dst->base = 0;
        arr_desc_src->base = reinterpret_cast<int64_t>(base);

        res = COI::BufferWriteMultiD(
            dst_buf,                  // in_DestBuffer,
            NULL,                     // DestProcess,
            m_vars[i].offset + m_vars[i].mic_offset -
            alloc_disp,               // Offset
            (void*)arr_desc_dst,      // descriptor of DestArray
            (void*)arr_desc_src,      // descriptor of SrcArray
            COI_COPY_UNSPECIFIED,     // Type
            m_num_in_dependencies,    // Number of in Dependencies
            m_p_in_dependencies,      // array of in Dependencies
            event);                   // out Dependency
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_copy, res);
        }
        return true;
    }

    if (m_vars[i].flags.is_non_cont_struct) {
        desc = m_vars_extra[i].noncont_desc;
        noncont_num = 0;
    }
    else {
        // Set length_src and length_dst
        length_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
        length_dst = !m_vars[i].into ? length_src :
                     (m_vars_extra[i].read_rng_dst) ?
                     m_vars_extra[i].read_rng_dst->range_size :
                     m_vars[i].size;
        send_size = (length_src < length_dst) ? length_src : length_dst;
    }

    // if event is defined we must multiply it for all contiguous ranges
    // that will be copied/written.
    // Take into account that we already have 1 event.
    if (event) {
        uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
            desc->interval_cnt :
            (length_src / send_size) *
            ((m_vars_extra[i].read_rng_src) ?
             m_vars_extra[i].read_rng_src->range_max_number : 1);
        m_in_deps_allocated += range_num;
        m_in_deps =
            (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
        m_in_deps_total--;
    }

    // consecutively get contiguous ranges,
    // define the corresponding destination offset and send data
    do {
        if (m_vars[i].flags.is_non_cont_struct) {
            // ranges are over
            if (noncont_num >= desc->interval_cnt) {
                break;
            }
            offset_src = offset_dst = desc->interval[noncont_num].lower;
            send_size = desc->interval[noncont_num].size;
            noncont_num++;
        }
        else {
            if (src_is_empty) {
                if (m_vars_extra[i].read_rng_src) {
                    if (!get_next_range(m_vars_extra[i].read_rng_src,
                                        &offset_src)) {
                        // source ranges are over - nothing to send
                        break;
                    }
                }
                else if (data_sent == 0) {
                    offset_src = m_vars_extra[i].cpu_disp;
                }
                else {
                    break;
                }
                length_src_cur = length_src;
            }
            else {
                // if source is contiguous or its contiguous range is greater
                // than destination one
                offset_src += send_size;
            }
            length_src_cur -= send_size;
            src_is_empty = length_src_cur == 0;

            if (dst_is_empty) {
                if (m_vars[i].into) {
                    if (m_vars_extra[i].read_rng_dst) {
                        if (!get_next_range(m_vars_extra[i].read_rng_dst,
                                            &offset_dst)) {
                            // destination ranges are over
                            LIBOFFLOAD_ERROR(c_destination_is_over);
                            return false;
                        }
                    }
                    // into is contiguous.
                    else {
                        offset_dst = m_vars[i].disp;
                    }
                    length_dst_cur = length_dst;
                }
                // both source and destination are contiguous
                else {
                    offset_dst = offset_src;
                    length_dst_cur = length_src;
                }
            }
            else {
                // if destination is contiguous or its contiguous range is
                // greater than the source one
                offset_dst += send_size;
            }
            length_dst_cur -= send_size;
            dst_is_empty = length_dst_cur == 0;
        }
        if (event) {
            event = &m_in_deps[m_in_deps_total++];
        }
        if (src_data != 0 && src_data->cpu_buf != 0) {
            res = COI::BufferCopy(
                dst_data->mic_buf,
                src_data->cpu_buf,
                m_vars[i].mic_offset +
                m_vars[i].offset + offset_dst,
                m_vars_extra[i].cpu_offset + offset_src,
                send_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_copy, res);
            }
        }
        else {
            char *base = offload_get_src_base(m_vars[i].ptr,
                                              m_vars_extra[i].type_src);
            res = COI::BufferWrite(
                dst_data->mic_buf,
                m_vars[i].mic_offset +
                m_vars[i].offset + offset_dst,
                base + offset_src,
                send_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_write, res);
            }
        }
        data_sent += send_size;
    }
    while (true);

    return true;
}
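// Illustrative walk of the range loop above (hypothetical shapes): sending
// slice a[0:2][0:2] of a 2x4 host array of 8-byte elements into a
// contiguous 2x2 target buffer yields send_size == 16 and two iterations:
//
//     // iteration 1: offset_src = 0,  offset_dst = 0,  copy 16 bytes
//     // iteration 2: offset_src = 32, offset_dst = 16, copy 16 bytes
//     // data_sent == 32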
bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);

    bool should_use_async_buffer_write = m_initial_need_runfunction;
    uint64_t ptr_sent = 0;
    COIRESULT res;
    uint32_t in_deps_amount = 0;
    COIEVENT *in_deps = NULL;

    // For offload_transfer and offload with empty body without signal:
    // - if there is only one buffer copy - send data synchronously
    // - if there are multiple buffer copies and
    //   __offload_parallel_copy is false - send data synchronously
    // - if there are multiple buffer copies and
    //   __offload_parallel_copy is true - send data asynchronously
    // This concerns only data whose size is at least
    // __offload_use_async_buffer_write; smaller data is always sent
    // synchronously. Synchronous transfer results in better performance
    // in COI. __offload_parallel_copy is false by default but can be
    // changed via the environment variable OFFLOAD_PARALLEL_COPY.
    if (!m_initial_need_runfunction && __offload_parallel_copy) {
        int big_size_count = 0;
        for (int i = 0; i < m_vars_total; i++) {
            if (m_vars[i].direction.in &&
                m_vars[i].size >= __offload_use_async_buffer_write) {
                switch (m_vars_extra[i].type_dst) {
                    // ...
                    case c_void_ptr_ptr:
                        // ...
                        if (m_vars[i].flags.is_static_dstn) {
                            big_size_count++;
                        }
                        break;
                    case c_string_ptr_ptr:
                    case c_data_ptr_ptr:
                    case c_cean_var_ptr:
                    case c_cean_var_ptr_ptr:
                    // ...
                    case c_dv_data_slice:
                    case c_dv_ptr_data_slice:
                        big_size_count++;
                        break;
                    default:
                        break;
                }
            }
        }
        if (big_size_count > 1) {
            should_use_async_buffer_write = true;
        }
    }

    // Initiate send for pointer data
    for (int i = 0; i < m_vars_total; i++) {
        uint64_t sent_data = m_vars[i].size;
        COIEVENT *event;

        if (m_vars_extra[i].omp_last_event_type == c_last_write &&
            m_in_deps_total > 0) {
            m_num_in_dependencies = m_in_deps_total;
            m_p_in_dependencies = m_in_deps;
        }
        switch (m_vars_extra[i].type_dst) {
            case c_data_ptr_array:
                break;
            // ...
            case c_void_ptr_ptr:
                // ...
                if (m_vars[i].direction.in &&
                    m_vars[i].flags.is_static_dstn) {
                    event =
                        (m_stream != no_stream ||
                         is_async ||
                         (should_use_async_buffer_write &&
                          m_vars[i].size >= __offload_use_async_buffer_write)) ?
                        &m_in_deps[m_in_deps_total++] : 0;
                    PtrData* dst_data = m_vars[i].into ?
                                        m_vars_extra[i].dst_data :
                                        m_vars_extra[i].src_data;
                    PtrData* src_data =
                        VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
                        VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
                        m_vars[i].flags.is_static ?
                        m_vars_extra[i].src_data : 0;

                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        if (!send_noncontiguous_pointer_data(
                                i, src_data, dst_data, event, sent_data,
                                m_num_in_dependencies, m_p_in_dependencies)) {
                            return false;
                        }
                    }
                    else if (src_data != 0 && src_data->cpu_buf != 0) {
                        res = COI::BufferCopy(
                            dst_data->mic_buf,
                            src_data->cpu_buf,
                            m_vars[i].mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        char *base = offload_get_src_base(m_vars[i].ptr,
                                         m_vars_extra[i].type_src);
                        res = COI::BufferWrite(
                            dst_data->mic_buf,
                            m_vars[i].mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            base + m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_write, res);
                        }
                    }
                    ptr_sent += sent_data;
                }
                break;

            // ...
                // If use_device_ptr no data needs to be sent
                if (m_vars[i].flags.use_device_ptr) {
                    break;
                }
            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
                // ...
                if (m_vars[i].direction.in && m_vars[i].size > 0) {
                    event =
                        (m_stream != no_stream ||
                         is_async ||
                         (should_use_async_buffer_write &&
                          m_vars[i].size >= __offload_use_async_buffer_write)) ?
                        &m_in_deps[m_in_deps_total++] : 0;
                    PtrData* dst_data = m_vars[i].into ?
                                        m_vars_extra[i].dst_data :
                                        m_vars_extra[i].src_data;
                    PtrData* src_data =
                        VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
                        VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
                        m_vars[i].flags.is_static ?
                        m_vars_extra[i].src_data : 0;

                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        send_noncontiguous_pointer_data(
                            i, src_data, dst_data, event, sent_data,
                            in_deps_amount, in_deps);
                    }
                    else if (src_data != 0 && src_data->cpu_buf != 0) {
                        res = COI::BufferCopy(
                            dst_data->mic_buf,
                            src_data->cpu_buf,
                            m_vars[i].mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        char *base = offload_get_src_base(m_vars[i].ptr,
                                         m_vars_extra[i].type_src);
                        res = COI::BufferWrite(
                            dst_data->mic_buf,
                            m_vars[i].mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            base + m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_write, res);
                        }
                    }
                    ptr_sent += sent_data;
                }
                break;

            case c_dv_data:
            case c_dv_ptr_data:
                if (m_vars[i].direction.in &&
                    m_vars[i].size > 0) {
                    PtrData *ptr_data = m_vars[i].into ?
                                        m_vars_extra[i].dst_data :
                                        m_vars_extra[i].src_data;
                    PtrData* src_data = m_vars_extra[i].src_data;

                    event =
                        (m_stream != no_stream ||
                         is_async ||
                         (should_use_async_buffer_write &&
                          m_vars[i].size >= __offload_use_async_buffer_write)) ?
                        &m_in_deps[m_in_deps_total++] : 0;

                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        send_noncontiguous_pointer_data(
                            i, src_data, ptr_data, event, sent_data,
                            in_deps_amount, in_deps);
                    }
                    else if (src_data && src_data->cpu_buf != 0) {
                        res = COI::BufferCopy(
                            ptr_data->mic_buf,
                            src_data->cpu_buf,
                            m_vars[i].offset + ptr_data->mic_offset +
                            m_vars[i].disp,
                            m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        char *base = offload_get_src_base(m_vars[i].ptr,
                                         m_vars_extra[i].type_src);
                        res = COI::BufferWrite(
                            ptr_data->mic_buf,
                            ptr_data->mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            base + m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_write, res);
                        }
                    }
                    ptr_sent += sent_data;
                }
                break;

            case c_dv_data_slice:
            case c_dv_ptr_data_slice:
                if (m_vars[i].direction.in &&
                    m_vars[i].size > 0) {
                    PtrData *dst_data = m_vars[i].into ?
                                        m_vars_extra[i].dst_data :
                                        m_vars_extra[i].src_data;
                    PtrData* src_data =
                        (VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
                         VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src) ||
                         VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) ||
                         VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
                         m_vars[i].flags.is_static) ?
                        m_vars_extra[i].src_data : 0;
                    event =
                        (m_stream != no_stream ||
                         is_async ||
                         (should_use_async_buffer_write &&
                          m_vars[i].size >= __offload_use_async_buffer_write)) ?
                        &m_in_deps[m_in_deps_total++] : 0;
                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        send_noncontiguous_pointer_data(
                            i, src_data, dst_data, event, sent_data,
                            in_deps_amount, in_deps);
                    }
                    else if (src_data && src_data->cpu_buf != 0) {
                        res = COI::BufferCopy(
                            dst_data->mic_buf,
                            src_data->cpu_buf,
                            dst_data->mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        char *base = offload_get_src_base(m_vars[i].ptr,
                                         m_vars_extra[i].type_src);
                        res = COI::BufferWrite(
                            dst_data->mic_buf,
                            dst_data->mic_offset +
                            m_vars[i].offset + m_vars[i].disp,
                            base + m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_write, res);
                        }
                    }
                    ptr_sent += sent_data;
                }
                break;

            default:
                break;
        }

        if (m_vars_extra[i].omp_last_event_type == c_last_write) {
            register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
        }
        // alloc field isn't used at target.
        // We can reuse it for offset of array pointers.
        if (m_vars_extra[i].is_arr_ptr_el) {
            m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
        }
    }

    // list of out events created while send_pointer_data now becomes input
    // dependencies for runfunction (or Read transfers from target if
    // runfunction is absent)
    m_num_in_dependencies = m_in_deps_total ? m_in_deps_total :
                                              m_num_in_dependencies;
    m_p_in_dependencies = m_in_deps_total ? m_in_deps : m_p_in_dependencies;

    if (m_status) {
        m_status->data_sent += ptr_sent;
    }

    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
                  c_offload_sent_pointer_data,
                  "Total pointer data sent to target: [%lld] bytes\n",
                  ptr_sent);

    return true;
}
bool OffloadDescriptor::gather_copyin_data()
{
    OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);

    if (m_need_runfunction && m_in_datalen > 0) {
        COIMAPINSTANCE map_inst;
        char *data;

        // map the input data buffer or use the in-place function descriptor
        if (m_inout_buf != 0) {
            OffloadTimer timer_map(get_timer_data(),
                                   c_offload_host_map_in_data_buffer);

            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
                                           COI_MAP_WRITE_ENTIRE_BUFFER,
                                           0, 0, 0, &map_inst,
                                           reinterpret_cast<void**>(&data));
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_map, res);
            }
        }
        else {
            data = (char*) m_func_desc + m_func_desc->data_offset;
        }

        // send variable descriptors
        memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
        data += m_vars_total * sizeof(VarDesc);

        // initialize input marshaller
        m_in.init_buffer(data, m_in_datalen);

        // Gather copy data into buffer
        for (int i = 0; i < m_vars_total; i++) {
            bool src_is_for_mic = (m_vars[i].direction.out ||
                                   m_vars[i].into == NULL);
            PtrData* ptr_data = src_is_for_mic ?
                                m_vars_extra[i].src_data :
                                m_vars_extra[i].dst_data;
            if (m_vars[i].flags.alloc_disp) {
                m_in.send_data(&ptr_data->alloc_disp,
                               sizeof(ptr_data->alloc_disp));
            }
            if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
                TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
                (m_vars_extra[i].type_src == c_data_ptr_array &&
                 m_vars[i].flags.is_pointer)) {
                m_in.send_data(&m_vars_extra[i].pointer_offset,
                               sizeof(m_vars_extra[i].pointer_offset));
            }
            // send the sink address to the target
            if (m_vars[i].flags.sink_addr) {
                m_in.send_data(&ptr_data->mic_addr,
                               sizeof(ptr_data->mic_addr));
            }

            switch (m_vars_extra[i].type_dst) {
                case c_data_ptr_array:
                    break;

                case c_void_ptr_ptr:
                    if (m_vars[i].direction.in &&
                        !m_vars[i].flags.is_static_dstn) {
                        char *ptr = offload_get_src_base(m_vars[i].ptr,
                                        m_vars_extra[i].type_src);
                        if (m_vars_extra[i].type_dst == c_cean_var) {
                            // offset and length are derived from the array
                            // descriptor
                            int64_t size = m_vars[i].size;
                            int64_t disp = m_vars[i].disp;
                            m_in.send_data(reinterpret_cast<char*>(&size),
                                           sizeof(int64_t));
                            m_in.send_data(reinterpret_cast<char*>(&disp),
                                           sizeof(int64_t));
                        }
                        m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
                                       m_vars[i].size);
                    }
                    break;

                case c_dv:
                    if (m_vars[i].direction.bits ||
                        m_vars[i].alloc_if ||
                        m_vars[i].free_if) {
                        // send the dope vector excluding the base
                        char *ptr = static_cast<char*>(m_vars[i].ptr);
                        m_in.send_data(ptr + sizeof(uint64_t),
                                       m_vars[i].size - sizeof(uint64_t));
                    }
                    break;

                case c_data_ptr:
                    // send to the target the addresses of obsolete
                    // stacks to be released
                    if (m_vars[i].flags.is_stack_buf &&
                        !m_vars[i].direction.bits &&
                        m_vars[i].alloc_if &&
                        m_vars[i].size != 0) {
                        for (PtrDataList::iterator it =
                             m_destroy_stack.begin();
                             it != m_destroy_stack.end(); it++) {
                            PtrData * ptr_data = *it;
                            m_in.send_data(&(ptr_data->mic_addr),
                                           sizeof(ptr_data->mic_addr));
                        }
                    }
                    break;

                case c_func_ptr_ptr:
                    if (m_vars[i].direction.in) {
                        m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
                    }
                    break;

                default:
                    break;
            }
        }

        if (m_status) {
            m_status->data_sent += m_in.get_tfr_size();
        }

        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_unmap(get_timer_data(),
                                     c_offload_host_unmap_in_data_buffer);
            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_unmap, res);
            }
        }
    }

    OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
    OFFLOAD_DEBUG_TRACE_1(1,
                  GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
                  "Total copyin data sent to target: [%lld] bytes\n",
                  m_in.get_tfr_size());

    return true;
}
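// Illustrative note on the copyin buffer layout produced above (assuming the
// marshalling scheme used by gather_copyin_data): the mapped region starts
// with the m_vars_total VarDesc records copied via memcpy, followed by the
// per-variable payloads written through m_in.send_data() in variable order,
// roughly:
//     [VarDesc 0 .. VarDesc n-1][alloc_disp?][sink addr?][var data]...
// The target side unpacks the region in the same order.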
bool OffloadDescriptor::compute(void *info)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);

    if (m_need_runfunction) {
        OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
                              c_offload_compute, "Compute task on MIC\n");

        void* misc = m_func_desc;
        int   misc_len = m_func_desc_size;
        void* ret = 0;
        int   ret_len = 0;

        if (m_func_desc->data_offset != 0) {
            misc_len += m_in_datalen;

            if (m_out_datalen > 0) {
                ret = (char*) m_func_desc + m_func_desc->data_offset;
                ret_len = m_out_datalen;
            }
        }

        // dispatch the task
        COIRESULT res;
        COIEVENT  event;

        res = m_device.compute(m_stream,
                               m_compute_buffers,
                               misc, misc_len,
                               ret, ret_len,
                               m_num_in_dependencies,
                               m_p_in_dependencies,
                               &event);
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_pipeline_run_func, res);
        }

        if (m_omp_async_last_event_type == c_last_runfunc) {
            register_omp_event_call_back(&event, info);
        }

        m_in_deps_total = m_num_in_dependencies = 1;
        m_in_deps[0] = event;
        m_p_in_dependencies = m_in_deps;
    }

    return true;
}
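// Illustrative note: after the dispatch above, the runfunction completion
// event becomes the single in-dependency for the out-transfer phase, so
// receive_pointer_data() will not start reading buffers until the target
// function has finished.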
// Receive pointer data if the source or the destination or both of them are
// noncontiguous. The destination is guaranteed to be long enough for the
// transferred data.
bool OffloadDescriptor::receive_noncontiguous_pointer_data(
    int i,
    COIBUFFER dst_buf,
    COIEVENT *event,
    uint64_t &received_data,
    uint32_t in_deps_amount,
    COIEVENT *in_deps
)
{
    NonContigDesc *desc;
    int noncont_num = 0;
    int64_t offset_src, offset_dst;
    int64_t length_src, length_dst;
    int64_t length_src_cur, length_dst_cur;
    int64_t receive_size;
    COIRESULT res;
    bool dst_is_empty = true;
    bool src_is_empty = true;

    char *base = offload_get_src_base(
        m_vars[i].into ?
        static_cast<char*>(m_vars[i].into) :
        static_cast<char*>(m_vars[i].ptr),
        m_vars_extra[i].type_dst);

    // If BufferReadMultiD is defined we can set the values of the required
    // arguments and transfer the noncontiguous data via a single call to
    // the COI routine.
    if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
        struct Arr_Desc* arr_desc_dst;
        struct Arr_Desc* arr_desc_src;
        int64_t size_src, size_dst;

        offset_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
        size_src = m_vars_extra[i].read_rng_src ?
            cean_get_transf_size(m_vars_extra[i].read_rng_src) :
            m_vars[i].size;

        offset_dst = (m_vars_extra[i].read_rng_dst) ?
            m_vars_extra[i].read_rng_dst->init_offset :
            m_vars_extra[i].cpu_disp;
        size_dst = m_vars_extra[i].read_rng_dst ?
            cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
            m_vars[i].size;

        int64_t el_size = (!m_vars[i].into ||
                           (m_vars_extra[i].read_rng_src &&
                            m_vars_extra[i].read_rng_dst)) ?
            1 :
            m_vars_extra[i].read_rng_src ?
                m_vars_extra[i].read_rng_src->arr_desc->dim[
                    m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
                m_vars_extra[i].read_rng_dst->arr_desc->dim[
                    m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
        arr_desc_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->arr_desc :
            make_arr_desc(NULL,    // not required for the source
                          offset_src/el_size, size_src/el_size,
                          el_size);
        arr_desc_dst = !m_vars[i].into ? arr_desc_src :
            (m_vars_extra[i].read_rng_dst) ?
                m_vars_extra[i].read_rng_dst->arr_desc :
                make_arr_desc(NULL,
                              offset_dst/el_size, size_src/el_size, el_size);

        arr_desc_dst->base = reinterpret_cast<int64_t>(base);

        res = COI::BufferReadMultiD(
            m_vars_extra[i].src_data->mic_buf,    // SourceBuffer
            m_vars[i].offset + m_vars[i].mic_offset -
            m_vars_extra[i].src_data->alloc_disp, // Offset
            (void*)arr_desc_dst,                  // descriptor of DestArray
            (void*)arr_desc_src,                  // descriptor of SrcArray
            COI_COPY_UNSPECIFIED,                 // Type
            m_num_in_dependencies,                // Number of in dependencies
            m_p_in_dependencies,                  // array of in dependencies
            event);                               // out dependency
        if (res != COI_SUCCESS) {
            if (m_status != 0) {
                m_status->result = translate_coi_error(res);
                return false;
            }
            report_coi_error(c_buf_copy, res);
        }
        return true;
    }

    if (m_vars[i].flags.is_non_cont_struct) {
        desc = m_vars_extra[i].noncont_desc;
        noncont_num = 0;
    }
    else {
        // Set length_src and length_dst
        length_src = (m_vars_extra[i].read_rng_src) ?
            m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
        length_dst = !m_vars[i].into ? length_src :
            (m_vars_extra[i].read_rng_dst) ?
            m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
        receive_size = (length_src < length_dst) ? length_src : length_dst;
    }

    // If an event is defined we must multiply it for all the contiguous
    // intervals that will be copied/read.
    // Take into account that we already have 1 event.
    if (event) {
        uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
            desc->interval_cnt :
            (length_src / receive_size) *
            ((m_vars_extra[i].read_rng_src) ?
             m_vars_extra[i].read_rng_src->range_max_number : 1);
        m_out_deps_allocated += range_num;
        m_out_deps =
            (COIEVENT*)realloc(m_out_deps,
                               sizeof(COIEVENT) * m_out_deps_allocated);
        if (m_out_deps == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
    }

    // Successively get the contiguous ranges, derive the corresponding
    // destination offset and receive the data.
    do {
        if (m_vars[i].flags.is_non_cont_struct) {
            // the intervals are over
            if (noncont_num >= desc->interval_cnt) {
                break;
            }
            offset_src = offset_dst = desc->interval[noncont_num].lower;
            receive_size = desc->interval[noncont_num].size;
            noncont_num++;
        }
        else if (src_is_empty) { // get the source offset
            if (m_vars_extra[i].read_rng_src) {
                if (!get_next_range(m_vars_extra[i].read_rng_src,
                                    &offset_src)) {
                    // the source ranges are over - nothing to send
                    break;
                }
            }
            else if (received_data == 0) {
                offset_src = m_vars[i].disp;
            }
            else {
                break;
            }
            length_src_cur = length_src;
        }
        else {
            // the source is contiguous or its contiguous range is greater
            // than the destination one
            offset_src += receive_size;
        }
        length_src_cur -= receive_size;
        src_is_empty = length_src_cur == 0;

        // get the destination offset
        if (dst_is_empty) {
            if (m_vars[i].into) {
                if (m_vars_extra[i].read_rng_dst) {
                    if (!get_next_range(m_vars_extra[i].read_rng_dst,
                                        &offset_dst)) {
                        // the destination ranges are over
                        LIBOFFLOAD_ERROR(c_destination_is_over);
                        return false;
                    }
                }
                else {
                    // the destination is contiguous.
                    offset_dst = m_vars_extra[i].cpu_disp;
                }
                length_dst_cur = length_dst;
            }
            else { // same as the source
                offset_dst = offset_src;
                length_dst_cur = length_src;
            }
        }
        else {
            // the destination is contiguous or its contiguous range is
            // greater than the source one
            offset_dst += receive_size;
        }
        length_dst_cur -= receive_size;
        dst_is_empty = length_dst_cur == 0;

        if (event) {
            event = &m_out_deps[m_out_deps_total++];
        }
        if (dst_buf != 0) {
            res = COI::BufferCopy(
                dst_buf,
                m_vars_extra[i].src_data->mic_buf,
                m_vars_extra[i].cpu_offset + offset_dst,
                m_vars[i].offset + offset_src +
                m_vars[i].mic_offset,
                receive_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_copy, res);
            }
        }
        else {
            res = COI::BufferRead(
                m_vars_extra[i].src_data->mic_buf,
                m_vars[i].offset + offset_src +
                m_vars[i].mic_offset,
                base + offset_dst,
                receive_size,
                COI_COPY_UNSPECIFIED,
                m_num_in_dependencies,
                m_p_in_dependencies,
                event);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_read, res);
            }
        }
        received_data += receive_size;
    }
    while (true);

    return true;
}
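// Worked example (illustrative) for the range walk above: given a
// noncontiguous source made of 64-byte ranges and a contiguous destination,
// receive_size = min(64, length_dst); each iteration issues one BufferRead
// (or BufferCopy when a host-side buffer exists) of receive_size bytes,
// advances offset_dst by receive_size, and accumulates received_data until
// both sides report empty.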
bool OffloadDescriptor::receive_pointer_data(bool is_async,
                                             bool first_run, void *info)
{
    OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);

    bool should_use_async_buffer_read = m_initial_need_runfunction;
    uint64_t ptr_received = 0;
    COIRESULT res;

    // For offload_transfer and offload with an empty body without signal:
    // - if there is only one buffer copy - get the data synchronously
    // - if there are multiple buffer copies and
    //   __offload_parallel_copy is false - get the data synchronously
    // - if there are multiple buffer copies
    //   and __offload_parallel_copy is true - get the data asynchronously
    // This concerns only data of size greater than or equal to
    // __offload_use_async_buffer_read; smaller data are always received
    // synchronously.  Synchronous transfer results in better performance
    // in COI.
    // __offload_parallel_copy is false by default but can be changed
    // via the environment variable OFFLOAD_PARALLEL_COPY.
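    // For illustration: with OFFLOAD_PARALLEL_COPY=1 and two out-pointers
    // whose sizes are both at or above the __offload_use_async_buffer_read
    // threshold, the reads below get completion events and can overlap;
    // with only one such pointer (or the variable unset) the transfers
    // stay synchronous.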
    if (!m_initial_need_runfunction && __offload_parallel_copy) {
        int big_size_count = 0;

        for (int i = 0; i < m_vars_total; i++) {
            if (m_vars[i].direction.out &&
                m_vars[i].size >= __offload_use_async_buffer_read) {
                // preallocated OUT only at the second run
                if (first_run == m_vars[i].flags.preallocated) {
                    continue;
                }
                switch (m_vars_extra[i].type_src) {
                    case c_void_ptr_ptr:
                        if (m_vars[i].flags.is_static) {
                            big_size_count++;
                        }
                        break;
                    case c_string_ptr_ptr:
                    case c_data_ptr_ptr:
                    case c_cean_var_ptr:
                    case c_cean_var_ptr_ptr:
                    case c_dv_data_slice:
                    case c_dv_ptr_data_slice:
                        big_size_count++;
                        break;
                    default:
                        break;
                }
            }
        }
        if (big_size_count > 1) {
            should_use_async_buffer_read = true;
        }
    }

    uint32_t in_deps_amount = m_in_deps_total;
    COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;

    for (int i = 0; i < m_vars_total; i++) {
        uint64_t received_data = m_vars[i].size;

        // Nothing to receive if use_device_ptr
        if (m_vars[i].flags.use_device_ptr)
            continue;

        if (m_vars_extra[i].omp_last_event_type == c_last_read &&
            m_out_deps_total > 0) {
            m_num_in_dependencies = m_out_deps_total;
            m_p_in_dependencies = m_out_deps;
        }

        // At the first run don't receive by a preallocated target pointer,
        // as the pointer value will be ready later, after the call to
        // scatter_copyout_data
        if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
            m_preallocated_alloc = true;
            // need one more call to OffloadDescriptor::receive_pointer_data
            if (m_vars[i].direction.out) {
                m_out_with_preallocated = true;
            }
            continue;
        }

        switch (m_vars_extra[i].type_src) {
            case c_data_ptr_array:
                break;

            case c_void_ptr_ptr:
                if (m_vars[i].direction.out &&
                    m_vars[i].flags.is_static) {
                    COIEVENT *event =
                        (m_stream != no_stream ||
                         is_async ||
                         m_in_deps_total > 0 ||
                         (should_use_async_buffer_read &&
                          m_vars[i].size >=
                              __offload_use_async_buffer_read)) ?
                        &m_out_deps[m_out_deps_total++] : 0;
                    PtrData *ptr_data = NULL;
                    COIBUFFER dst_buf = NULL; // buffer at host
                    char *base = NULL;

                    if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
                        ptr_data = m_vars[i].into ?
                                   m_vars_extra[i].dst_data :
                                   m_vars_extra[i].src_data;
                    }
                    else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
                        if (m_vars[i].flags.is_static_dstn) {
                            ptr_data = m_vars[i].into ?
                                       m_vars_extra[i].dst_data :
                                       m_vars_extra[i].src_data;
                        }
                    }
                    dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
                    if (dst_buf == NULL) {
                        base = offload_get_src_base(
                            m_vars[i].into ?
                            static_cast<char*>(m_vars[i].into) :
                            static_cast<char*>(m_vars[i].ptr),
                            m_vars_extra[i].type_dst);
                    }

                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        receive_noncontiguous_pointer_data(
                            i, dst_buf, event, received_data,
                            m_num_in_dependencies, m_p_in_dependencies);
                    }
                    else if (dst_buf != 0) {
                        res = COI::BufferCopy(
                            dst_buf,
                            m_vars_extra[i].src_data->mic_buf,
                            m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].offset + m_vars[i].disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        res = COI::BufferRead(
                            m_vars_extra[i].src_data->mic_buf,
                            m_vars[i].offset + m_vars[i].disp,
                            base + m_vars_extra[i].cpu_offset +
                            m_vars_extra[i].cpu_disp,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_read, res);
                        }
                    }
                    ptr_received += received_data;
                }
                break;

            case c_string_ptr_ptr:
            case c_data_ptr_ptr:
            case c_cean_var_ptr:
            case c_cean_var_ptr_ptr:
            case c_dv_data_slice:
            case c_dv_ptr_data_slice: {
                COIBUFFER dst_buf = NULL; // buffer on host
                if (m_vars[i].direction.out && m_vars[i].size > 0) {
                    COIEVENT *event =
                        (m_stream != no_stream ||
                         is_async ||
                         m_in_deps_total > 0 ||
                         (should_use_async_buffer_read &&
                          m_vars[i].size >=
                              __offload_use_async_buffer_read)) ?
                        &m_out_deps[m_out_deps_total++] : 0;

                    uint64_t dst_offset = 0;
                    char *base = static_cast<char*>(m_vars[i].ptr);

                    if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
                        PtrData *ptr_data = m_vars[i].into ?
                                            m_vars_extra[i].dst_data :
                                            m_vars_extra[i].src_data;
                        dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
                        if (dst_buf == NULL) {
                            base = m_vars[i].into ?
                                   *static_cast<char**>(m_vars[i].into) :
                                   *static_cast<char**>(m_vars[i].ptr);
                        }
                        dst_offset = m_vars_extra[i].cpu_offset +
                                     m_vars_extra[i].cpu_disp;
                    }
                    else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
                        if (m_vars[i].flags.is_static_dstn) {
                            dst_buf = m_vars[i].into ?
                                      m_vars_extra[i].dst_data->cpu_buf :
                                      m_vars_extra[i].src_data->cpu_buf;
                        }
                        if (dst_buf == NULL) {
                            base = offload_get_src_base(
                                m_vars[i].into ?
                                static_cast<char*>(m_vars[i].into) :
                                static_cast<char*>(m_vars[i].ptr),
                                m_vars_extra[i].type_dst);
                        }
                        dst_offset = m_vars_extra[i].cpu_offset +
                                     m_vars_extra[i].cpu_disp;
                    }
                    else if (VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst) ||
                             VAR_TYPE_IS_DV_DATA_SLICE(
                                 m_vars_extra[i].type_dst)) {
                        PtrData *ptr_data = m_vars[i].into != 0 ?
                                            m_vars_extra[i].dst_data :
                                            m_vars_extra[i].src_data;
                        dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
                        if (dst_buf == NULL) {
                            base = offload_get_src_base(
                                m_vars[i].into ?
                                static_cast<char*>(m_vars[i].into) :
                                static_cast<char*>(m_vars[i].ptr),
                                m_vars_extra[i].type_dst);
                        }
                        dst_offset = m_vars_extra[i].cpu_offset +
                                     m_vars_extra[i].cpu_disp;
                    }

                    if (m_vars[i].flags.is_non_cont_struct ||
                        m_vars[i].flags.is_noncont_src ||
                        m_vars[i].flags.is_noncont_dst) {
                        receive_noncontiguous_pointer_data(
                            i, dst_buf, event, received_data,
                            m_num_in_dependencies, m_p_in_dependencies);
                    }
                    else if (dst_buf != 0) {
                        res = COI::BufferCopy(
                            dst_buf,
                            m_vars_extra[i].src_data->mic_buf,
                            dst_offset,
                            m_vars[i].offset + m_vars[i].disp +
                            m_vars[i].mic_offset,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_copy, res);
                        }
                    }
                    else {
                        res = COI::BufferRead(
                            m_vars_extra[i].src_data->mic_buf,
                            m_vars[i].offset + m_vars[i].disp +
                            m_vars[i].mic_offset,
                            base + dst_offset,
                            m_vars[i].size,
                            COI_COPY_UNSPECIFIED,
                            m_num_in_dependencies,
                            m_p_in_dependencies,
                            event);
                        if (res != COI_SUCCESS) {
                            if (m_status != 0) {
                                m_status->result = translate_coi_error(res);
                                return false;
                            }
                            report_coi_error(c_buf_read, res);
                        }
                    }
                    ptr_received += received_data;
                }
                break;
            }

            default:
                break;
        }

        if (m_vars_extra[i].omp_last_event_type == c_last_read) {
            register_omp_event_call_back(
                &m_out_deps[m_out_deps_total - 1], info);
        }

        // destroy buffers for obsolete stacks
        if (m_destroy_stack.size() != 0) {
            for (PtrDataList::iterator it = m_destroy_stack.begin();
                 it != m_destroy_stack.end(); it++) {
                PtrData *ptr_data = *it;
                m_destroy_buffers.push_back(ptr_data->mic_buf);
                OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
                              ptr_data->mic_addr);
            }
            m_destroy_stack.clear();
        }
        if (m_vars[i].free_if) {
            // remove the association for automatic variables
            if (m_vars_extra[i].auto_data) {
                AutoData *auto_data = m_vars_extra[i].auto_data;
                if (m_vars[i].flags.always_delete) {
                    auto_data->nullify_reference();
                }
                else if (auto_data->remove_reference() == 0) {
                    m_device.remove_auto_data(auto_data->cpu_addr.start());
                }
                continue;
            }
            else {
                PtrData *ptr_data = m_vars_extra[i].src_data;
                if (ptr_data &&
                    IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
                    if (ptr_data->get_reference() > 0) {
                        ptr_data->remove_reference();
                    }
                    continue;
                }
            }

            // destroy buffers
            if (m_vars[i].direction.out || m_vars[i].into == NULL) {
                if (!VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) &&
                    !VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) &&
                    !VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src)) {
                    continue;
                }

                PtrData *ptr_data = m_vars_extra[i].src_data;
                if (ptr_data->remove_reference() == 0) {
                    if (ptr_data->cpu_buf != 0) {
                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
                    }
                    if (ptr_data->mic_buf != 0) {
                        m_destroy_buffers.push_back(ptr_data->mic_buf);
                    }
                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
                                  ptr_data->cpu_addr.start());

                    // remove the association from the map
                    if (m_vars[i].flags.targetptr) {
                        m_device.remove_targetptr_data(
                            ptr_data->cpu_addr.start());
                    }
                    else {
                        m_device.remove_ptr_data(ptr_data->cpu_addr.start());
                    }
                }
            }
            else if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst) ||
                     VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst) ||
                     VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst)) {
                PtrData *ptr_data = m_vars_extra[i].dst_data;

                if (ptr_data->remove_reference() == 0) {
                    if (ptr_data->cpu_buf != 0) {
                        m_destroy_buffers.push_back(ptr_data->cpu_buf);
                    }
                    if (ptr_data->mic_buf != 0) {
                        m_destroy_buffers.push_back(ptr_data->mic_buf);
                    }
                    OFFLOAD_TRACE(3, "Removing association for addr %p\n",
                                  ptr_data->cpu_addr.start());

                    // remove the association from the map
                    if (m_vars[i].flags.targetptr) {
                        m_device.remove_targetptr_data(
                            ptr_data->cpu_addr.start());
                    }
                    else {
                        m_device.remove_ptr_data(ptr_data->cpu_addr.start());
                    }
                }
            }
        }
    }

    if (m_status) {
        m_status->data_received += ptr_received;
    }

    // list of out events created by the reads now becomes the input
    // dependency list for whatever follows
    m_num_in_dependencies = m_out_deps_total ? m_out_deps_total :
                                               m_num_in_dependencies;
    m_p_in_dependencies = m_out_deps_total ? m_out_deps : m_p_in_dependencies;

    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
    OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
                  c_offload_received_pointer_data,
                  "Total pointer data received from target: [%lld] bytes\n",
                  ptr_received);

    return true;
}
bool OffloadDescriptor::scatter_copyout_data()
{
    OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);

    if (m_need_runfunction && m_out_datalen > 0) {
        // total size that needs to be transferred from target to host
        COIMAPINSTANCE map_inst;
        char *data;

        // output data buffer
        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_map(get_timer_data(),
                                   c_offload_host_map_out_data_buffer);

            COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
                                           COI_MAP_READ_ONLY, 0, 0, 0,
                                           &map_inst,
                                           reinterpret_cast<void**>(&data));
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_map, res);
            }
        }
        else {
            data = (char*) m_func_desc + m_func_desc->data_offset;
        }

        // get the timer data collected on the target
        OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
        data += OFFLOAD_TIMER_DATALEN();

        // initialize output marshaller
        m_out.init_buffer(data, m_out_datalen);

        for (int i = 0; i < m_vars_total; i++) {
            bool src_is_for_mic = (m_vars[i].direction.out ||
                                   m_vars[i].into == NULL);

            if (m_vars_extra[i].type_src != c_data_ptr_array &&
                m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
                PtrData *ptr_data;
                void *ptr_value;
                void ** cpu_ptr = src_is_for_mic ?
                                  reinterpret_cast<void**>(m_vars[i].ptr) :
                                  reinterpret_cast<void**>(m_vars[i].into);
                void* alloc_base = NULL;
                int64_t alloc_disp = 0;
                int64_t alloc_size = 0;
                if (m_vars_extra[i].alloc != NULL) {
                    // the alloc clause carries an array descriptor
                    const Arr_Desc *ap =
                        static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);

                    __arr_data_offset_and_length(ap, alloc_disp, alloc_size);

                    alloc_base = reinterpret_cast<void*>(ap->base);
                }

                // get the pointer to target memory
                m_out.receive_data(&ptr_value, sizeof(void*));

                // add a new entry
                if (!alloc_ptr_data(
                        ptr_data,
                        ptr_value,
                        (alloc_base != NULL) ?
                            alloc_disp : m_vars[i].disp,
                        (alloc_base != NULL) ?
                            alloc_size : m_vars[i].size,
                        alloc_disp,
                        0,
                        m_vars[i].flags.targetptr,
                        m_vars[i].flags.preallocated,
                        m_vars[i].flags.pin)) {
                    return false;
                }

                ptr_data->add_reference();
                *cpu_ptr = ptr_value;
                if (src_is_for_mic) {
                    m_vars_extra[i].src_data = ptr_data;
                }
                else {
                    m_vars_extra[i].dst_data = ptr_data;
                }
                m_vars[i].offset = (char*) ptr_value -
                                   (char*) ptr_data->cpu_addr.start();
            }

            switch (m_vars_extra[i].type_src) {
                case c_data_ptr_array:
                    break;

                case c_void_ptr_ptr:
                    if (m_vars[i].direction.out &&
                        !m_vars[i].flags.is_static) {
                        if (m_vars[i].into) {
                            char *ptr = offload_get_src_base(
                                static_cast<char*>(m_vars[i].into),
                                m_vars_extra[i].type_dst);
                            m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
                                               m_vars[i].size);
                        }
                        else {
                            m_out.receive_data(
                                static_cast<char*>(m_vars[i].ptr) +
                                m_vars_extra[i].cpu_disp,
                                m_vars[i].size);
                        }
                    }
                    break;

                case c_func_ptr_ptr:
                    if (m_vars[i].direction.out) {
                        m_out.receive_func_ptr((const void**) m_vars[i].ptr);
                    }
                    break;

                default:
                    break;
            }
        }

        if (m_status) {
            m_status->data_received += m_out.get_tfr_size();
        }

        if (m_func_desc->data_offset == 0) {
            OffloadTimer timer_unmap(get_timer_data(),
                                     c_offload_host_unmap_out_data_buffer);
            COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
            if (res != COI_SUCCESS) {
                if (m_status != 0) {
                    m_status->result = translate_coi_error(res);
                    return false;
                }
                report_coi_error(c_buf_unmap, res);
            }
        }
    }

    OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
    OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
                  m_out.get_tfr_size());

    return true;
}
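// Illustrative note on the copyout buffer consumed above: the mapped region
// starts with the timer data block (read via OFFLOAD_TIMER_TARGET_DATA and
// skipped with OFFLOAD_TIMER_DATALEN()), followed by the marshalled output
// values -- including target pointer values for preallocated variables --
// which m_out.receive_data()/receive_func_ptr() unpack in variable order.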
static void get_arr_desc_numbers(
    const Arr_Desc *ap,
    int64_t el_size,
    int64_t &offset,
    int64_t &size,
    int     &el_number,
    CeanReadRanges* &ptr_ranges
)
{
    if (is_arr_desc_contiguous(ap)) {
        ptr_ranges = NULL;
        __arr_data_offset_and_length(ap, offset, size);
        el_number = size / el_size;
    }
    else {
        ptr_ranges = init_read_ranges_arr_desc(ap);
        el_number = (ptr_ranges->range_size / el_size) *
                    ptr_ranges->range_max_number;
        size = ptr_ranges->range_size;
    }
}
bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
{
    int pointers_number;
    int tmp_val;
    int new_index = m_vars_total;
    const Arr_Desc *ap;
    const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
    int flags = vd3->array_fields;
    bool src_is_for_mic = (m_vars[i].direction.out ||
                           m_vars[i].into == NULL);

    ReadArrElements<void *>  ptr;
    ReadArrElements<void *>  into;
    ReadArrElements<int64_t> ext_start;
    ReadArrElements<int64_t> ext_elements;
    ReadArrElements<int64_t> align;
    ReadArrElements<int64_t> alloc_if;
    ReadArrElements<int64_t> free_if;
    ReadArrElements<int64_t> into_start;
    ReadArrElements<int64_t> into_elem;
    ReadArrElements<int64_t> alloc_start;
    ReadArrElements<int64_t> alloc_elem;

    ap = static_cast<const Arr_Desc*>(vd3->ptr_array);

    // 1. Count the total number of transferred pointers in
    // "pointers_number". For each of them we create a new var_desc and put
    // it at the bottom of the var_desc array.
    get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
                         pointers_number, ptr.ranges);
    ptr.base = reinterpret_cast<char*>(ap->base);

    // 2. prepare memory for the new var_descs
    m_vars_total += pointers_number;
    m_vars = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
    if (m_vars == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    m_vars_extra =
        (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
    if (m_vars_extra == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    m_in_deps =
        (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
    if (m_in_deps == NULL)
        LIBOFFLOAD_ERROR(c_malloc);
    m_out_deps =
        (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
    if (m_out_deps == NULL)
        LIBOFFLOAD_ERROR(c_malloc);

    // 3. Prepare for reading the new var_desc's fields
    // EXTENT START
    if ((flags & (1<<flag_extent_start_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->extent_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
                             ext_start.size, tmp_val, ext_start.ranges);
        ext_start.base = reinterpret_cast<char*>(ap->base);
        ext_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
            return false;
        }
    }
    else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
        ext_start.val = (int64_t)vd3->extent_start;
    }
    else {
        ext_start.val = 0;
    }

    // EXTENT ELEMENTS NUMBER
    if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
                             ext_elements.offset, ext_elements.size,
                             tmp_val, ext_elements.ranges);
        ext_elements.base = reinterpret_cast<char*>(ap->base);
        ext_elements.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
            return false;
        }
    }
    else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
        ext_elements.val = (int64_t)vd3->extent_elements;
    }
    else {
        ext_elements.val = m_vars[i].count;
    }

    // ALLOC_IF
    if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
                             alloc_if.size, tmp_val, alloc_if.ranges);
        alloc_if.base = reinterpret_cast<char*>(ap->base);
        alloc_if.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
            return false;
        }
    }
    else {
        alloc_if.val = m_vars[i].alloc_if;
    }

    // FREE_IF
    if ((flags & (1<<flag_free_if_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
                             free_if.size, tmp_val, free_if.ranges);
        free_if.base = reinterpret_cast<char*>(ap->base);
        free_if.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
            return false;
        }
    }
    else {
        free_if.val = m_vars[i].free_if;
    }

    // ALIGN
    if ((flags & (1<<flag_align_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->align_array);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
                             align.size, tmp_val, align.ranges);
        align.base = reinterpret_cast<char*>(ap->base);
        align.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
            return false;
        }
    }
    else {
        align.val = m_vars[i].align;
    }
    // 3.1 INTO
    if (m_vars[i].into) {
        ap = static_cast<const Arr_Desc*>(m_vars[i].into);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
                             into.size, tmp_val, into.ranges);
        into.base = reinterpret_cast<char*>(ap->base);

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
            return false;
        }
    }

    // 3.2 INTO_START
    if ((flags & (1<<flag_into_start_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->into_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
                             into_start.size, tmp_val, into_start.ranges);
        into_start.base = reinterpret_cast<char*>(ap->base);
        into_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
            return false;
        }
    }
    else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
        into_start.val = (int64_t)vd3->into_start;
    }
    else {
        into_start.val = 0;
    }

    // 3.3 INTO_ELEMENTS
    if ((flags & (1<<flag_into_elements_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->into_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
                             into_elem.size, tmp_val, into_elem.ranges);
        into_elem.base = reinterpret_cast<char*>(ap->base);
        into_elem.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
            return false;
        }
    }
    else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
        into_elem.val = (int64_t)vd3->into_elements;
    }
    else {
        into_elem.val = m_vars[i].count;
    }

    // 3.4 ALLOC_START
    if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
                             alloc_start.offset, alloc_start.size, tmp_val,
                             alloc_start.ranges);
        alloc_start.base = reinterpret_cast<char*>(ap->base);
        alloc_start.el_size = ap->dim[ap->rank - 1].size;

        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
            return false;
        }
    }
    else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
        alloc_start.val = (int64_t)vd3->alloc_start;
    }
    else {
        alloc_start.val = 0;
    }

    // 3.5 ALLOC_ELEMENTS
    if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
        ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
        get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
                             alloc_elem.size, tmp_val, alloc_elem.ranges);
        alloc_elem.base = reinterpret_cast<char*>(ap->base);
        alloc_elem.el_size = ap->dim[ap->rank - 1].size;
        if (tmp_val < pointers_number) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
                             "alloc_extent elements");
            return false;
        }
    }
    else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
        alloc_elem.val = (int64_t)vd3->alloc_elements;
    }
    else {
        alloc_elem.val = 0;
    }
    for (int k = 0; k < pointers_number; k++) {
        int type = flags & 0x3f;
        int type_src, type_dst;

        // type_src, type_dst
        type_src = type_dst = (type == c_data_ptr_array) ?
                              c_data_ptr : (type == c_func_ptr_array) ?
                              c_func_ptr : (type == c_void_ptr_array) ?
                              c_void_ptr : (type == c_string_ptr_array) ?
                              c_string_ptr : 0;

        // Get the pointer value
        if (!ptr.read_next(true)) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "pointer array");
            return false;
        }
        else {
            ptr.val = (void*)(ptr.base + ptr.offset);
        }

        // !!! If we get an error at the reading phase it is an internal
        // !!! error, as any mismatch must have been detected before
        if (m_vars[i].into) {
            if (!into.read_next(true)) {
                LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
                return false;
            }
            else {
                into.val = (void*)(into.base + into.offset);
            }
        }

        // Get the other components of the clause
        if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
            return false;
        }
        if (!ext_elements.read_next(
                flags & (1<<flag_extent_elements_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
            return false;
        }
        if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
            return false;
        }
        if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
            return false;
        }
        if (!align.read_next(flags & (1<<flag_align_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
            return false;
        }
        if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
            return false;
        }
        if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
            return false;
        }
        if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
            return false;
        }
        if (!alloc_elem.read_next(
                flags & (1<<flag_alloc_elements_is_array))) {
            LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
                             "alloc_extent elements");
            return false;
        }

        m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
        m_vars[new_index + k].alloc_if = alloc_if.val;
        m_vars[new_index + k].free_if = free_if.val;
        m_vars[new_index + k].align = align.val;
        m_vars[new_index + k].mic_offset = 0;
        m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
        m_vars[new_index + k].flags.is_pointer = 0;
        m_vars[new_index + k].offset = 0;
        m_vars[new_index + k].size = m_vars[i].size;
        m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
        m_vars[new_index + k].flags.preallocated =
            m_vars[i].flags.preallocated;

        if (ext_start.val == 0) {
            m_vars[new_index + k].count = ext_elements.val;
            m_vars[new_index + k].ptr = ptr.val;
            if (type_src == c_string_ptr) {
                m_vars[new_index + k].size = 0;
            }
        }
        else {
            m_vars[new_index + k].count = 0;
            m_vars[new_index + k].ptr =
                static_cast<void*>(make_arr_desc(
                    (int64_t)ptr.val,
                    ext_start.val,
                    ext_elements.val,
                    m_vars[i].size));

            type_src = type_src == c_data_ptr ? c_cean_var_ptr :
                       type_src == c_string_ptr ? c_cean_var_ptr :
                       type_src;
            if (!m_vars[i].into) {
                type_dst = type_src;
            }
        }

        if (m_vars[i].into && into_elem.val != 0) {
            m_vars[new_index + k].into =
                static_cast<void*>(make_arr_desc(
                    (int64_t)into.val,
                    into_start.val,
                    into_elem.val,
                    m_vars[i].size));
            type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
                       (type == c_string_ptr_array) ? c_cean_var_ptr :
                       type_dst;
        }
        else {
            m_vars[new_index + k].into = NULL;
        }

        if (alloc_elem.val != 0) {
            m_vars[new_index + k].alloc =
                static_cast<void*>(make_arr_desc(
                    (int64_t)ptr.val,
                    alloc_start.val,
                    alloc_elem.val,
                    m_vars[i].size));
        }
        else {
            m_vars[new_index + k].alloc = NULL;
        }

        m_vars[new_index + k].type.src =
            m_vars_extra[new_index + k].type_src = type_src;
        m_vars[new_index + k].type.dst =
            m_vars_extra[new_index + k].type_dst = type_dst;

        m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
        m_vars_extra[new_index + k].is_arr_ptr_el = 1;
        m_vars_extra[new_index + k].ptr_arr_offset =
            src_is_for_mic ? ptr.offset : into.offset;
    }
    // The count and alloc fields are useless at the target. They can be
    // reused for pointer arrays.
    m_vars[i].count = pointers_number;
    m_vars[i].ptr_arr_offset = new_index;

    return true;
}
// Gets the in-dependencies of the previous offload via the stream "m_stream".
// Out argument in_deps_amount - address of the number of dependencies.
// Out argument in_deps - address of the array of dependencies.
// Description of the dependency scheme for streams:
// ----------------------------------------------------
// Every offload forms a DAG consisting of 3 nodes:
// in-transfers, runfunction and out-transfers.
// Every node has in-dependencies and out-dependencies.
// The out-dependencies of the previous node form the in-dependencies of the
// current node. Without streams, the in-dependencies of the 1st node (the
// in-transfers) are NULL. With streams, the in-dependencies of the 1st node
// are the list of out-dependencies of the last node of the previous offload
// issued via this stream.
// So the DAGs of 2 consecutive offloads via the same stream are connected
// in the way described above.
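// Illustrative sketch of the chaining described above:
//
//   offload N   : [in-transfers] -> [runfunction] -> [out-transfers]
//                                                         |
//                             (out-deps of the last node) v
//   offload N+1 : [in-transfers] -> [runfunction] -> [out-transfers]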
void OffloadDescriptor::get_stream_in_dependencies(
    uint32_t &in_deps_amount,
    COIEVENT* &in_deps
)
{
    if (m_stream != no_stream && m_stream != 0) {
        Stream * stream = Stream::find_stream(m_stream, false);
        if (!stream) {
            LIBOFFLOAD_ERROR(c_offload_no_stream,
                             m_device.get_logical_index());
        }
        OffloadDescriptor* offload = stream->get_last_offload();

        // if it's the first offload in the stream
        if (!offload) {
            return;
        }
        // if the last offload has out-transfers
        if (offload->m_out_deps_total) {
            in_deps_amount = offload->m_out_deps_total;
            in_deps = offload->m_out_deps;
        }
        // the last offload only sends pointer data or runs the function or
        // both of them and has no out-transfers
        else if (offload->m_in_deps_total) {
            in_deps_amount = offload->m_in_deps_total;
            in_deps = offload->m_in_deps;
        }
    }
}
static void __offload_fini_library(void)
{
    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
    if (mic_engines_total > 0) {
        delete[] mic_engines;
        mic_engines_total = 0;

        if (mic_proxy_fs_root != 0) {
            free(mic_proxy_fs_root);
            mic_proxy_fs_root = 0;
        }

        if (knc_library_path != 0) {
            free(knc_library_path);
            knc_library_path = 0;
        }

        if (knl_library_path != 0) {
            free(knl_library_path);
            knl_library_path = 0;
        }

        // destroy thread key
        thread_key_delete(mic_thread_key);
    }

    // unload COI library
    if (COI::is_available) {
        COI::fini();
    }

    OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
}
typedef std::pair<int, micLcpuMask*> deviceLcpu;
typedef std::list<deviceLcpu> deviceLcpuList;

static int process_offload_devices(
    const char *env_var,
    uint32_t num_devices,
    deviceLcpuList &device_cpu_list
)
{
    // The value is composed of comma-separated physical device indexes,
    // each optionally qualified by a logical CPU subset, e.g. 0[60,70-80]
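    // For illustration: OFFLOAD_DEVICES=0[60,70-80],2 selects physical
    // device 0 restricted to logical CPUs 60 and 70 through 80, plus
    // physical device 2 with no CPU restriction.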
    char *buf = strdup(env_var);
    if (buf == NULL)
        LIBOFFLOAD_ERROR(c_malloc);

    char *str = buf;
    bool device_set_finished = false;
    int num_devices_specified = 0;
    do {
        char *dev_ptr = str;
        int dev_len = strcspn(str, "[,");
        micLcpuMask* cpu_mask = 0;
        if (str[dev_len] == '[') {
            // CPU subset specified
            cpu_mask = new micLcpuMask;
            char *cpu_ptr = str + dev_len + 1;
            int64_t cnum;
            do {
                bool cpu_set_finished = false;
                int cpu_len = strcspn(cpu_ptr, ",-]");
                if (cpu_ptr[cpu_len] == ',' || cpu_ptr[cpu_len] == ']') {
                    // A single CPU specified
                    cpu_set_finished = cpu_ptr[cpu_len] == ']';
                    cpu_ptr[cpu_len] = '\0';
                    // Convert the cpu string to an int
                    if (!__offload_parse_int_string(cpu_ptr, cnum)) {
                        LIBOFFLOAD_ERROR(c_mic_init7);
                        delete cpu_mask;
                        free(buf);
                        return 0;
                    }
                    else {
                        OFFLOAD_DEBUG_TRACE(3,
                            "Single CPU %d selected\n", cnum);
                        cpu_mask->set(cnum);
                    }
                    cpu_ptr = cpu_ptr + cpu_len + 1;
                    if (cpu_set_finished) {
                        break;
                    }
                } else if (cpu_ptr[cpu_len] == '-') {
                    int64_t range_start, range_end;
                    // A range of CPUs specified
                    cpu_ptr[cpu_len] = '\0';
                    // Convert the cpu string to an int
                    if (!__offload_parse_int_string(cpu_ptr, range_start)) {
                        LIBOFFLOAD_ERROR(c_mic_init8);
                        delete cpu_mask;
                        free(buf);
                        return 0;
                    }
                    else {
                        OFFLOAD_DEBUG_TRACE(3,
                            "Start of CPU range specified as %d\n",
                            range_start);
                        cpu_ptr = cpu_ptr + cpu_len + 1;
                        cpu_len = strcspn(cpu_ptr, ",]");
                        if (cpu_ptr[cpu_len] == ',' ||
                            cpu_ptr[cpu_len] == ']') {
                            cpu_set_finished = cpu_ptr[cpu_len] == ']';
                            cpu_ptr[cpu_len] = '\0';
                            // Convert the cpu string to an int
                            if (!__offload_parse_int_string(
                                    cpu_ptr, range_end)) {
                                LIBOFFLOAD_ERROR(c_mic_init9);
                                delete cpu_mask;
                                free(buf);
                                return 0;
                            }
                            else {
                                OFFLOAD_DEBUG_TRACE(3,
                                    "End of CPU range specified as %d\n",
                                    range_end);
                                if (range_end < range_start) {
                                    LIBOFFLOAD_ERROR(c_mic_init10);
                                    delete cpu_mask;
                                    free(buf);
                                    return 0;
                                }
                                else {
                                    for (int i = range_start;
                                         i <= range_end; i++)
                                    {
                                        OFFLOAD_DEBUG_TRACE(3,
                                            "CPU %d selected as part of range\n",
                                            i);
                                        cpu_mask->set(i);
                                    }
                                    cpu_ptr = cpu_ptr + cpu_len + 1;
                                    if (cpu_set_finished) {
                                        break;
                                    }
                                }
                            }
                        }
                        else {
                            LIBOFFLOAD_ERROR(c_mic_init10);
                            delete cpu_mask;
                            free(buf);
                            return 0;
                        }
                    }
                } else {
                    // Error: expected , or - or ]
                    LIBOFFLOAD_ERROR(c_mic_init11);
                    delete cpu_mask;
                    free(buf);
                    return 0;
                }
            } while (true);

            // Point to the next device specification
            str = cpu_ptr;
            if (*str == '\0') {
                device_set_finished = true;
            }
            else {
                // Skip the comma after a device specification
                str++;
            }
        } else if (str[dev_len] == ',') {
            // CPU subset not specified
            // Point to the next device specification
            str = str + dev_len + 1;
        } else {
            // No more device specifications
            device_set_finished = true;
        }

        dev_ptr[dev_len] = '\0';
        // Convert the device string to an int
        int64_t num;
        if (!__offload_parse_int_string(dev_ptr, num)) {
            LIBOFFLOAD_ERROR(c_mic_init5);
            delete cpu_mask;
            free(buf);
            return 0;
        }
        if (num < 0 || num >= num_devices) {
            LIBOFFLOAD_ERROR(c_mic_init6, num);
            delete cpu_mask;
            free(buf);
            return 0;
        }
        OFFLOAD_DEBUG_TRACE(3, "Offloadable MIC = %d\n", num);
        // Save the specified physical device and cpu mask
        device_cpu_list.push_back(make_pair(num, cpu_mask));
        num_devices_specified++;

        if (device_set_finished) {
            break;
        }
    } while (true);

    free(buf);
    return num_devices_specified;
}
static void __offload_init_library_once(void)
{
    COIRESULT res;
    uint32_t num_devices;
    deviceLcpuList device_cpu_list;

    prefix = report_get_message_str(c_report_host);

    // initialize trace
    const char *env_var = getenv(htrace_envname);
    if (env_var != 0 && *env_var != '\0') {
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            console_enabled = new_val & 0x0f;
        }
    }

    OFFLOAD_DEBUG_TRACE(2, "---- Start of environment variable processing\n");

    env_var = getenv(offload_report_envname);
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
                            offload_report_envname, env_var);
        int64_t env_val;
        if (__offload_parse_int_string(env_var, env_val)) {
            if (env_val == OFFLOAD_REPORT_1 ||
                env_val == OFFLOAD_REPORT_2 ||
                env_val == OFFLOAD_REPORT_3) {
                offload_report_level = env_val;
                OFFLOAD_DEBUG_TRACE(2, "Offload report level set to %d\n",
                                    offload_report_level);
            }
            else {
                LIBOFFLOAD_ERROR(c_invalid_env_report_value,
                                 offload_report_envname);
            }
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
                             offload_report_envname);
        }
    }
    else if (!offload_report_level) {
        env_var = getenv(timer_envname);
        if (env_var != 0 && *env_var != '\0') {
            OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n", timer_envname, env_var);
            timer_enabled = atoi(env_var);
            OFFLOAD_DEBUG_TRACE(2, "Timer enable flag set to %d\n",
                                timer_enabled);
        }
    }

    // Process OFFLOAD_NODES, the specification of the physical MICs available
    env_var = getenv("OFFLOAD_NODES");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_NODES=%s\n", env_var);
        // Pass the env var on to COI
        char * new_env_var =
                   (char*) malloc(sizeof("COI_OFFLOAD_NODES=") +
                                  strlen(env_var) + 1);
        if (new_env_var == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        sprintf(new_env_var, "COI_OFFLOAD_NODES=%s", env_var);
        putenv(new_env_var);
        OFFLOAD_DEBUG_TRACE(2, "Setting COI_OFFLOAD_NODES = %s \n",
                            getenv("COI_OFFLOAD_NODES"));

        // the value is composed of comma-separated physical device indexes
        char *buf = strdup(env_var);
        if (buf == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        char *str, *ptr;
        int num_mics = 0;
        for (str = strtok_r(buf, ",", &ptr); str != 0;
             str = strtok_r(0, ",", &ptr)) {
            num_mics++;
        }
        OFFLOAD_DEBUG_TRACE(2, "Number of offloadable MICs = %d\n", num_mics);
        free(buf);
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_NODES is not set\n");
    }

    // get the number of devices installed in the system
    res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
    if (res != COI_SUCCESS) {
        return;
    }

    if (num_devices > MIC_ENGINES_MAX) {
        num_devices = MIC_ENGINES_MAX;
    }

    // Determine the devices & cpus that can be used for offloading
    env_var = getenv("OFFLOAD_DEVICES");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DEVICES=%s\n", env_var);
        if (strcasecmp(env_var, "none") != 0) {
            mic_engines_total =
                process_offload_devices(
                    env_var, num_devices, device_cpu_list);
            if (mic_engines_total > 0) {
                OFFLOAD_DEBUG_TRACE(2, "Valid value, %d device(s) specified\n",
                                    mic_engines_total);
            }
            else {
                OFFLOAD_DEBUG_TRACE(2, "Invalid value, will not offload\n");
                return;
            }
        }
        else {
            // No need to continue since there are no offload devices
            return;
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DEVICES is not set\n");
    }
    if (mic_engines_total == 0) {
        // Fallback to using all available devices and all CPUs on each
        OFFLOAD_DEBUG_TRACE(2, "Fallback to all devices\n");
        device_cpu_list.clear();
        mic_engines_total = 0;
        for (int i = 0; i < num_devices; i++) {
            COIENGINE engine;
            res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
            if (res == COI_SUCCESS) {
                device_cpu_list.push_back(make_pair(i, (micLcpuMask*)0));
                OFFLOAD_DEBUG_TRACE(2, "Device %d is available\n", i);
                mic_engines_total++;
            }
        }
    }

    // no need to continue if there are no devices to offload to
    if (mic_engines_total <= 0) {
        return;
    }

    // Initialize indexes for the available devices
    mic_engines = new Engine[mic_engines_total];
    std::list<deviceLcpu>::iterator deviceIterator;
    int l_idx = 0;
    for (deviceIterator = device_cpu_list.begin();
         deviceIterator != device_cpu_list.end();
         deviceIterator++)
    {
        deviceLcpu device_mask_pair = *deviceIterator;
        int device_num = device_mask_pair.first;
        micLcpuMask *device_mask = device_mask_pair.second;

        mic_engines[l_idx].set_indexes(l_idx, device_num);
        mic_engines[l_idx].set_cpu_mask(device_mask);
        OFFLOAD_DEBUG_TRACE(2,
            "Logical MIC%d => Physical MIC%d\n", l_idx, device_num);
        if (device_mask != NULL) {
            std::string cpu_string =
                device_mask->to_string<
                    char,
                    std::string::traits_type,
                    std::string::allocator_type>();
            OFFLOAD_DEBUG_TRACE(2, "    CPUs: %s\n", cpu_string.data());
        }
        else {
            OFFLOAD_DEBUG_TRACE(2, "    CPUs: all\n");
        }
        l_idx++;
    }

    // Get the DMA channel count to pass to COI
    env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DMA_CHANNEL_COUNT=%s\n", env_var);
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_dma_channel_count = new_val;
            OFFLOAD_DEBUG_TRACE(2, "Using %d DMA channels\n",
                                mic_dma_channel_count);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             "OFFLOAD_DMA_CHANNEL_COUNT");
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DMA_CHANNEL_COUNT is not set\n");
    }

    // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
    // Use putenv instead of setenv as Windows has no setenv.
    // Note: putenv requires that its argument is never freed or modified,
    // so there is no free after the call to putenv or elsewhere.
    env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_HOST_THREAD_AFFINITY=%s\n", env_var);
        char * new_env_var =
                   (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
                                  strlen(env_var) + 1);
        if (new_env_var == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
        putenv(new_env_var);
        OFFLOAD_DEBUG_TRACE(2, "Setting COI_HOST_THREAD_AFFINITY = %s \n",
                            getenv("COI_HOST_THREAD_AFFINITY"));
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_HOST_THREAD_AFFINITY is not set\n");
    }

    // library search path for KNC device binaries
    env_var = getenv("MIC_LD_LIBRARY_PATH");
    if (env_var != 0) {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_LD_LIBRARY_PATH=%s\n", env_var);
        knc_library_path = strdup(env_var);
        if (knc_library_path == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        OFFLOAD_DEBUG_TRACE(2, "KNC library path set to %s\n", knc_library_path);
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_LD_LIBRARY_PATH is not set\n");
    }

    // library search path for KNL device binaries
    env_var = getenv("LD_LIBRARY_PATH");
    if (env_var != 0) {
        OFFLOAD_DEBUG_TRACE(2, "---- LD_LIBRARY_PATH=%s\n", env_var);
        knl_library_path = strdup(env_var);
        if (knl_library_path == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        OFFLOAD_DEBUG_TRACE(2, "KNL library path set to %s\n", knl_library_path);
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "LD_LIBRARY_PATH is not set\n");
    }

    // memory size reserved for COI buffers
    env_var = getenv("MIC_BUFFERSIZE");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_BUFFERSIZE=%s\n", env_var);
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_buffer_size = new_size;
            OFFLOAD_DEBUG_TRACE(2,
                "Reserved memory for COI buffers set to %lld bytes\n",
                mic_buffer_size);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_BUFFERSIZE is not set\n");
    }

    // memory size reserved for 4K pages for COI buffers
    env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_4K_BUFFER_RESERVE_SIZE=%s\n", env_var);
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_4k_buffer_size = new_size;
            OFFLOAD_DEBUG_TRACE(2,
                "Reserved memory for 4K COI buffers set to %lld bytes\n",
                mic_4k_buffer_size);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             "MIC_4K_BUFFER_RESERVE_SIZE");
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_4K_BUFFER_RESERVE_SIZE is not set\n");
    }

    // memory size reserved for 2M pages for COI buffers
    env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_2M_BUFFER_RESERVE_SIZE=%s\n", env_var);
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size)) {
            mic_2m_buffer_size = new_size;
            OFFLOAD_DEBUG_TRACE(2,
                "Reserved memory for 2M COI buffers set to %lld bytes\n",
                mic_2m_buffer_size);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                             "MIC_2M_BUFFER_RESERVE_SIZE");
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_2M_BUFFER_RESERVE_SIZE is not set\n");
    }

    // determine the stack size for the pipeline on the device
    env_var = getenv("MIC_STACKSIZE");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_STACKSIZE=%s\n", env_var);
        uint64_t new_size;
        if (__offload_parse_size_string(env_var, new_size) &&
            (new_size >= 16384) && ((new_size & 4095) == 0)) {
            mic_stack_size = new_size;
            OFFLOAD_DEBUG_TRACE(2, "MIC stack size set to %lld bytes\n",
                                mic_stack_size);
        }
        else {
            LIBOFFLOAD_ERROR(c_mic_init3);
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_STACKSIZE is not set\n");
    }

    // proxy I/O
    env_var = getenv("MIC_PROXY_IO");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_IO=%s\n", env_var);
        int64_t new_val;
        if (__offload_parse_int_string(env_var, new_val)) {
            mic_proxy_io = new_val;
            OFFLOAD_DEBUG_TRACE(2, "MIC proxy i/o set to %s\n",
                                mic_proxy_io ? "enabled" : "disabled");
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_IO is not set\n");
    }

    // proxy filesystem root
    env_var = getenv("MIC_PROXY_FS_ROOT");
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_FS_ROOT=%s\n", env_var);
        mic_proxy_fs_root = strdup(env_var);
        if (mic_proxy_fs_root == NULL)
            LIBOFFLOAD_ERROR(c_malloc);
        OFFLOAD_DEBUG_TRACE(2, "MIC proxy fs root set to %s\n",
                            mic_proxy_fs_root);
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_FS_ROOT is not set\n");
    }

    // Prepare the environment for the target process using the following
    // rules:
    // - If MIC_ENV_PREFIX is set, then any environment variable on the
    //   host which has that prefix is copied to the device without
    //   the prefix.
    //   All other host environment variables are ignored.
    // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then the host
    //   environment is duplicated.
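    // For illustration: with MIC_ENV_PREFIX=MYAPP_ set on the host, a host
    // variable MYAPP_OMP_NUM_THREADS=8 would be forwarded to the target
    // process as OMP_NUM_THREADS=8, while unprefixed host variables are
    // ignored.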
6497 env_var
= getenv("MIC_ENV_PREFIX");
6498 if (env_var
!= 0 && *env_var
!= '\0') {
6499 OFFLOAD_DEBUG_TRACE(2, "---- MIC_ENV_PREFIX=%s\n", env_var
);
6500 mic_env_vars
.set_prefix(env_var
);
6502 int len
= strlen(env_var
);
6503 for (int i
= 0; environ
[i
] != 0; i
++) {
6504 if (strncmp(environ
[i
], env_var
, len
) == 0 &&
6505 strncmp(environ
[i
], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
6506 environ
[i
][len
] != '=') {
6507 mic_env_vars
.analyze_env_var(environ
[i
]);
6512 OFFLOAD_DEBUG_TRACE(2, "MIC_ENV_PREFIX is not set\n");
6515 // create key for thread data
6516 if (thread_key_create(&mic_thread_key
, Engine::destroy_thread_data
)) {
6517 LIBOFFLOAD_ERROR(c_mic_init4
, errno
);
6522 cpu_frequency
= COI::PerfGetCycleFrequency();
6524 env_var
= getenv(mic_use_2mb_buffers_envname
);
6525 if (env_var
!= 0 && *env_var
!= '\0') {
6526 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6527 mic_use_2mb_buffers_envname
, env_var
);
6529 if (__offload_parse_size_string(env_var
, new_size
)) {
6530 __offload_use_2mb_buffers
= new_size
;
6531 OFFLOAD_DEBUG_TRACE(2,
6532 "Threshold for use of 2M buffers set to %lld\n",
6533 __offload_use_2mb_buffers
);
6536 LIBOFFLOAD_ERROR(c_invalid_env_var_value
,
6537 mic_use_2mb_buffers_envname
);
6541 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", mic_use_2mb_buffers_envname
);
6544 env_var
= getenv(mic_use_async_buffer_write_envname
);
6545 if (env_var
!= 0 && *env_var
!= '\0') {
6546 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6547 mic_use_async_buffer_write_envname
, env_var
);
6549 if (__offload_parse_size_string(env_var
, new_size
)) {
6550 __offload_use_async_buffer_write
= new_size
;
6551 OFFLOAD_DEBUG_TRACE(2,
6552 "Threshold for async buffer write set to %lld\n",
6553 __offload_use_async_buffer_write
);
6557 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6558 mic_use_async_buffer_write_envname
);
6561 env_var
= getenv(mic_use_async_buffer_read_envname
);
6562 if (env_var
!= 0 && *env_var
!= '\0') {
6563 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6564 mic_use_async_buffer_read_envname
, env_var
);
6566 if (__offload_parse_size_string(env_var
, new_size
)) {
6567 __offload_use_async_buffer_read
= new_size
;
6568 OFFLOAD_DEBUG_TRACE(2,
6569 "Threshold for async buffer read set to %lld\n",
6570 __offload_use_async_buffer_read
);
6574 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6575 mic_use_async_buffer_read_envname
);
6578 // mic initialization type
6579 env_var
= getenv(offload_init_envname
);
6580 if (env_var
!= 0 && *env_var
!= '\0') {
6581 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6582 offload_init_envname
, env_var
);
6583 if (strcmp(env_var
, "on_offload") == 0) {
6584 __offload_init_type
= c_init_on_offload
;
6585 OFFLOAD_DEBUG_TRACE(2,
6586 "A MIC device will be initialized "
6587 "on first offload to that device\n");
6589 else if (strcmp(env_var
, "on_offload_all") == 0) {
6590 __offload_init_type
= c_init_on_offload_all
;
6591 OFFLOAD_DEBUG_TRACE(2,
6592 "All MIC devices will be initialized "
6593 "on first offload to any device\n");
6595 else if (strcmp(env_var
, "on_start") == 0) {
6596 __offload_init_type
= c_init_on_start
;
6597 OFFLOAD_DEBUG_TRACE(2,
6598 "All MIC devices will be initialized "
6599 "at program start\n");
6602 LIBOFFLOAD_ERROR(c_invalid_env_var_value
, offload_init_envname
);
6606 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_init_envname
);
6610 env_var
= getenv(offload_active_wait_envname
);
6611 if (env_var
!= 0 && *env_var
!= '\0') {
6612 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6613 offload_active_wait_envname
, env_var
);
6615 if (__offload_parse_int_string(env_var
, new_val
)) {
6616 __offload_active_wait
= new_val
;
6617 OFFLOAD_DEBUG_TRACE(2,
6618 "Flag to poll on event completion is set to %d\n",
6619 __offload_active_wait
);
6622 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
6623 offload_active_wait_envname
);
6627 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_active_wait_envname
);
6631 env_var
= getenv(offload_always_wait_envname
);
6632 if (env_var
!= 0 && *env_var
!= '\0') {
6633 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6634 offload_always_wait_envname
, env_var
);
6636 if (__offload_parse_int_string(env_var
, new_val
)) {
6637 __offload_always_wait
= new_val
;
6638 OFFLOAD_DEBUG_TRACE(2,
6639 "Flag to poll on event completion is set to %d\n",
6640 __offload_active_wait
);
6643 LIBOFFLOAD_ERROR(c_invalid_env_var_int_value
,
6644 offload_always_wait_envname
);
6648 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_always_wait_envname
);
6652 env_var
= getenv(omp_device_num_envname
);
6653 if (env_var
!= 0 && *env_var
!= '\0') {
6654 OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6655 omp_device_num_envname
, env_var
);
6657 if (__offload_parse_int_string(env_var
, new_val
) && new_val
>= 0) {
6658 __omp_device_num
= new_val
;
6659 OFFLOAD_DEBUG_TRACE(2, "OpenMP default device number is set to %d\n",
6663 LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env
,
6664 omp_device_num_envname
);
6668 OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", omp_device_num_envname
);
    // parallel copy of offload_transfer
    env_var = getenv(parallel_copy_envname);
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
            parallel_copy_envname, env_var);

        if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
            __offload_parallel_copy = new_val;
            OFFLOAD_DEBUG_TRACE(2,
                "Flag for using async buffer copy is set to %d\n",
                __offload_parallel_copy);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                parallel_copy_envname);
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", parallel_copy_envname);
    }
    // use COI interface for noncontiguous arrays transfer
    env_var = getenv(use_coi_noncontiguous_transfer_envname);
    if (env_var != 0 && *env_var != '\0') {
        OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
            use_coi_noncontiguous_transfer_envname, env_var);

        if (__offload_parse_size_string(env_var, new_size)) {
            __offload_use_coi_noncontiguous_transfer = new_size;
            OFFLOAD_DEBUG_TRACE(2,
                "Flag for using new COI noncontiguous API is set to %d\n",
                __offload_use_coi_noncontiguous_transfer);
        }
        else {
            LIBOFFLOAD_ERROR(c_invalid_env_var_value,
                use_coi_noncontiguous_transfer_envname);
        }
    }
    else {
        OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
            use_coi_noncontiguous_transfer_envname);
    }

    OFFLOAD_DEBUG_TRACE(2, "---- End of environment variable processing\n");
}
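
// Illustrative settings for the variables parsed above. The names assume the
// conventional values of the corresponding *_envname constants defined
// earlier in this file; the values are examples only:
//
//     OFFLOAD_INIT=on_start        # initialize all MIC devices at startup
//     OFFLOAD_ACTIVE_WAIT=1        # poll on event completion
//     OMP_DEFAULT_DEVICE=0         # OpenMP default device number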

extern int __offload_init_library(void)
{
    // do one time initialization
    static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
    __offload_run_once(&ctrl, __offload_init_library_once);

    // offload is available if COI is available and the number of devices > 0
    bool is_available = COI::is_available && (mic_engines_total > 0);

    // register pending libraries if there are any
    if (is_available && __target_libs) {
        mutex_locker_t locker(__target_libs_lock);

        for (TargetImageList::iterator it = __target_libs_list.begin();
             it != __target_libs_list.end(); it++) {
            // Register library in COI
            COI::ProcessRegisterLibraries(1, &it->data, &it->size,
                                          &it->origin, &it->offset);

            // add lib to all engines
            for (int i = 0; i < mic_engines_total; i++) {
                mic_engines[i].add_lib(*it);
            }
        }

        __target_libs = false;
        __target_libs_list.clear();
    }

    return is_available;
}
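
// The run-once guard above follows the same pattern as C++11's
// std::call_once; an equivalent sketch using only the standard library
// (illustrative only, the runtime keeps its own OffloadOnceControl
// primitive):
//
//     static std::once_flag ctrl;                         // <mutex>
//     std::call_once(ctrl, __offload_init_library_once);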

extern "C" bool __offload_target_image_is_executable(const void *target_image)
{
    const struct Image *image = static_cast<const struct Image*>(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;

    // determine image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    return (hdr->e_type == ET_EXEC);
}
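
// Illustrative use of the predicate above by a hypothetical caller:
//
//     if (__offload_target_image_is_executable(img)) {
//         // img wraps the application's target executable (ET_EXEC)
//     }
//     else {
//         // img wraps a target shared library (ET_DYN)
//     }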

extern "C" bool __offload_register_image(const void *target_image)
{
    const struct Image *image = static_cast<const struct Image*>(target_image);
    const void *data = image->data + strlen(image->data) + 1;
    uint64_t size = image->size;
    uint64_t offset = 0;

    // decode image
    const char *fat_name = image->data;
    char *mic_name = (char *) malloc(strlen(image->data) + 1);
    char *host_name = (char *) malloc(strlen(image->data));

    if ((mic_name == NULL) || (host_name == NULL))
        LIBOFFLOAD_ERROR(c_malloc);

    // The origin name is the name of the file on the host
    // this is used by Vtune, since it is a fat binary we
    // use the host file name of the fat binary.
    // Driver prepends the host file name ending with "?"
    // to the image->data name so need to extract the string
    // name format: <mic_name>?<origin>

    // Get <mic_name>
    int i = 0;
    while ((*fat_name != '\0') && (*fat_name != '?')) {
        mic_name[i] = *fat_name;
        fat_name++;
        i++;
    }

    // Remove the host file name by inserting end of string marker
    mic_name[i] = '\0';

    // Get <origin>
    if (*fat_name == '?') {
        // The string following "?" is the name of the host file name.
        fat_name++;
        i = 0;
        while (*fat_name != '\0') {
            host_name[i] = *fat_name;
            fat_name++;
            i++;
        }
        host_name[i] = '\0';
    }
    else {
        // Windows currently does not have a host name
        free(host_name);
        host_name = NULL;
    }
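
    // For example (illustrative values only): a fat name of
    //     "a_mic.out?/home/user/a.out"
    // yields mic_name "a_mic.out" and host_name "/home/user/a.out".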

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    switch (hdr->e_type) {
        case ET_EXEC:
            __current_image_is_dll = false;
            // Each offload application is supposed to have only one target
            // image representing target executable.
            // No thread synchronization is required here as the initialization
            // code is always executed in a single thread.
            if (__target_exe != 0) {
                LIBOFFLOAD_ERROR(c_multiple_target_exes);
                exit(1);
            }
            __target_exe = new TargetImage(mic_name, data, size,
                                           host_name, offset);

            // Registration code for execs is always called from the context
            // of main and thus we can safely call any function here,
            // including LoadLibrary API on windows. This is the place where
            // we do the offload library initialization.
            if (__offload_init_library()) {
                // initialize engine if init_type is on_start
                if (__offload_init_type == c_init_on_start) {
                    for (int i = 0; i < mic_engines_total; i++) {
                        mic_engines[i].init();
                    }
                }
            }
            return mic_engines_total > 0;

        case ET_DYN:
        {
            char * fullname = NULL;
            __current_image_is_dll = true;
            // We add the library to a list of pending libraries
            __target_libs_lock.lock();
            __target_libs = true;
            __target_libs_list.push_back(
                TargetImage(mic_name, data, size, fullname, offset));
            __target_libs_lock.unlock();
            // If __target_exe is set, then main has started running
            // If not main, then we can't do anything useful here
            // because this registration code is called from DllMain
            // context (on windows).
            if (__target_exe != 0) {
                // There is no need to delay loading the library
                if (!__offload_init_library()) {
                    // Couldn't validate library as a fat offload library
                    LIBOFFLOAD_ERROR(c_unknown_binary_type);
                    exit(1);
                }
            }
            return true;
        }

        default:
            // something is definitely wrong, issue an error and exit
            LIBOFFLOAD_ERROR(c_unknown_binary_type);
            exit(1);
    }
}

// When dlopen is used, dlclose may happen after the COI process
// is destroyed, in which case images cannot be unloaded and should
// be skipped. So track whether COI has been unloaded.
static bool coi_may_have_been_unloaded = false;

extern "C" void __offload_unregister_image(const void *target_image)
{
    // Target image is packed as follows:
    //      8 bytes                - size of the target binary
    //      null-terminated string - binary name
    //      <size> bytes           - binary contents
    const struct Image {
        int64_t size;
        char data[];
    } *image = static_cast<const struct Image*>(target_image);

    // decode image
    const char *name = image->data;
    const void *data = image->data + strlen(image->data) + 1;

    // our actions depend on the image type
    const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
    if (hdr->e_type == ET_EXEC) {
        // We are executing exec's destructors.
        // It is time to do a library cleanup.
        if (timer_enabled) {
            Offload_Timer_Print();
        }

        coi_may_have_been_unloaded = true;

#ifdef MYO_SUPPORT
        // Do not unload the MYO library if it was loaded in a DLL.
        if (!__myo_init_in_so) {
            __offload_myoFini();
        }
#endif // MYO_SUPPORT

        __offload_fini_library();
    }
    else if ((hdr->e_type == ET_DYN) && !coi_may_have_been_unloaded) {
        for (int i = 0; i < mic_engines_total; i++) {
            mic_engines[i].unload_library(data, name);
        }
    }
}
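
// A minimal decoding sketch for the packed layout described above ('blob' is
// a hypothetical pointer to a packed target image):
//
//     const Image *img  = static_cast<const Image*>(blob);
//     int64_t      sz   = img->size;                    // 8-byte size field
//     const char  *nm   = img->data;                    // binary name
//     const void  *bits = img->data + strlen(nm) + 1;   // binary contents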

extern "C" void __offload_register_task_callback(void (*cb)(void *))
{
    task_completion_callback = cb;
}
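
// Illustrative registration of a task-completion hook (the callback name is
// hypothetical):
//
//     static void on_task_done(void *task) { /* ... */ }
//     __offload_register_task_callback(on_task_done);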

// Runtime trace interface for user programs

void __offload_console_trace(int level)
{
    console_enabled = level;
}

// User-visible offload API

int _Offload_number_of_devices(void)
{
    __offload_init_library();
    return mic_engines_total;
}

int _Offload_get_device_number(void)
{
    return -1;
}

int _Offload_get_physical_device_number(void)
{
    return -1;
}

int _Offload_signaled(int index, void *signal)
{
    __offload_init_library();

    // check index value
    if (index < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, index);
        exit(1);
    }

    index %= mic_engines_total;

    // find associated async task
    OffloadDescriptor *task =
        mic_engines[index].find_signal(signal, false);
    if (task == 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
        exit(1);
    }
    // if signal is removed by wait completing
    else if (task == SIGNAL_HAS_COMPLETED) {
        return true;
    }

    return task->is_signaled();
}
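
// Illustrative completion query ('sig' stands for a signal variable used in
// an earlier asynchronous offload; both names are hypothetical):
//
//     int done = _Offload_signaled(0, &sig);   // device 0; nonzero when done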

void _Offload_report(int val)
{
    if (val == OFFLOAD_REPORT_ON ||
        val == OFFLOAD_REPORT_OFF) {
        offload_report_enabled = val;
    }
}

int _Offload_find_associated_mic_memory(
    int          target,
    const void*  cpu_addr,
    void**       cpu_base_addr,
    uint64_t*    buf_length,
    void**       mic_addr,
    uint64_t*    mic_buf_start_offset,
    int*         is_static
)
{
    __offload_init_library();

    // check target value
    if (target < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, target);
        exit(1);
    }

    target %= mic_engines_total;

    // find existing association in pointer table
    PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
    if (ptr_data == 0) {
        OFFLOAD_TRACE(3, "Association does not exist\n");
        return 0;
    }

    OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
                  ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
                  ptr_data->is_static);

    if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
                                                  &ptr_data->mic_addr);
        if (res != COI_SUCCESS) {
            return 0;
        }
    }
    *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
    *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
    *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
    *mic_buf_start_offset = ptr_data->alloc_disp;
    *is_static = ptr_data->is_static;
    return ptr_data->is_static ? 1 : ptr_data->get_reference();
}
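
// Illustrative lookup for a host pointer 'p' on device 0 (all variable names
// are hypothetical; the call returns 0 when no association exists):
//
//     void *base, *mic; uint64_t len, off; int is_static;
//     int refs = _Offload_find_associated_mic_memory(
//                    0, p, &base, &len, &mic, &off, &is_static);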

_Offload_stream _Offload_stream_create(
    int device,           // MIC device number
    int number_of_cpus    // Cores allocated to the stream
    )
{
    __offload_init_library();

    // check target value
    if (device < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, device);
        exit(1);
    }

    device %= mic_engines_total;

    // Create new stream and get its handle
    _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
    if (handle == 0) {
        OFFLOAD_TRACE(3, "Can't create stream\n");
        return 0;
    }

    // create pipeline associated with the new stream
    mic_engines[device].get_pipeline(handle);

    return handle;
}
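
// Illustrative stream lifetime (values are hypothetical):
//
//     _Offload_stream s = _Offload_stream_create(0, 4);  // device 0, 4 cores
//     ...                                                // offloads into s
//     _Offload_stream_destroy(0, s);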

int _Offload_stream_destroy(
    int device,             // MIC device number
    _Offload_stream handle  // stream to destroy
    )
{
    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        exit(1);
    }
    // check target value
    if (device < 0) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, device);
        exit(1);
    }

    device %= mic_engines_total;

    mic_engines[device].stream_destroy(handle);

    return true;
}

int _Offload_stream_delete(
    _Offload_stream handle  // stream to destroy
    )
{
    int device = -1;  // MIC device number; set once the stream is found
    Stream * stream;

    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        exit(1);
    }

    stream = Stream::find_stream(handle, false);
    // the stream was not created or was destroyed
    if (!stream) {
        LIBOFFLOAD_ERROR(c_offload_no_stream, device);
        exit(1);
    }

    device = stream->get_device();

    mic_engines[device].stream_destroy(handle);

    return true;
}

int _Offload_stream_completed(int device, _Offload_stream handler)
{
    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        exit(1);
    }
    // check device index value
    if (device < -1) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, device);
        exit(1);
    }
    else if (device > -1) {
        device %= mic_engines_total;
    }

    // a nonzero handler references a concrete stream
    if (handler != 0) {
        Stream * stream = Stream::find_stream(handler, false);

        // the stream was not created or was destroyed
        if (!stream) {
            LIBOFFLOAD_ERROR(c_offload_no_stream, device);
            exit(1);
        }

        if (device != stream->get_device()) {
            LIBOFFLOAD_ERROR(c_offload_device_doesnt_match_to_stream,
                             stream->get_device());
            exit(1);
        }

        // find associated async task
        OffloadDescriptor *task = stream->get_last_offload();

        // offload was completed by offload_wait pragma or wait clause
        if (task == 0) {
            return true;
        }
        return task->is_signaled();
    }
    // zero handler is for all streams at the device
    else {
        StreamMap stream_map = Stream::all_streams;
        for (StreamMap::iterator it = stream_map.begin();
             it != stream_map.end(); it++) {
            Stream * stream = it->second;
            if (device != -1 && device != stream->get_device()) {
                continue;
            }

            // find associated async task
            OffloadDescriptor *task = stream->get_last_offload();

            // offload was completed by offload_wait pragma or wait clause
            if (task == 0) {
                continue;
            }
            // if even one stream is not completed result is false
            if (!task->is_signaled()) {
                return false;
            }
        }
        // no uncompleted streams
        return true;
    }
}

int _Offload_stream_is_empty(_Offload_stream handle)
{
    int device = -1;

    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        exit(1);
    }

    Stream * stream = Stream::find_stream(handle, false);
    // the stream was not created or was destroyed
    if (!stream) {
        LIBOFFLOAD_ERROR(c_offload_no_stream, device);
        exit(1);
    }

    device = stream->get_device();

    // _Offload_stream_completed ignores the device index when a concrete
    // stream handle is given, so the stream's own device is passed
    return _Offload_stream_completed(device, handle);
}

int _Offload_device_streams_completed(int device)
{
    if (Stream::get_streams_count() == 0) {
        LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
        exit(1);
    }
    // check index value
    if (device < -1) {
        LIBOFFLOAD_ERROR(c_offload_signaled1, device);
        exit(1);
    }
    else if (device > -1) {
        device %= mic_engines_total;
    }

    StreamMap stream_map = Stream::all_streams;
    for (StreamMap::iterator it = stream_map.begin();
         it != stream_map.end(); it++)
    {
        Stream * stream = it->second;

        if (device != -1 && device != stream->get_device()) {
            continue;
        }

        // find associated async task
        OffloadDescriptor *task = stream->get_last_offload();

        // offload was completed by offload_wait pragma or wait clause
        if (task == 0) {
            continue;
        }
        // if even one stream is not completed result is false
        if (!task->is_signaled()) {
            return false;
        }
    }
    // no uncompleted streams
    return true;
}
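
// Illustrative host-side poll until every stream on device 0 has drained
// (passing -1 would check streams on all devices):
//
//     while (!_Offload_device_streams_completed(0)) {
//         // do other host work
//     }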

// IDB support

int   __dbg_is_attached = 0;
int   __dbg_target_id = -1;
pid_t __dbg_target_so_pid = -1;
char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
const int __dbg_api_major_version = 1;
const int __dbg_api_minor_version = 0;

void __dbg_target_so_loaded()
{
}

void __dbg_target_so_unloaded()
{
}