// NOTE: provenance of this file (extraction artifacts converted to comments):
// [official-gcc.git] / liboffloadmic / runtime / offload_engine.h
// blob b4cdad503be3f6b532c512c2506e590076bd7696
/*
    Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef OFFLOAD_ENGINE_H_INCLUDED
#define OFFLOAD_ENGINE_H_INCLUDED

#include <limits.h>

#include <bitset>
#include <list>
#include <set>
#include <map>

#include "offload_common.h"
#include "coi/coi_client.h"

// Sentinel descriptor value marking a signal whose offload has completed.
#define SIGNAL_HAS_COMPLETED ((OffloadDescriptor *)-1)

// Handle value meaning "no stream specified".
const int64_t no_stream = -1;
// Address range: a half-open interval [start, start + length).
class MemRange {
public:
    MemRange() : m_start(0), m_length(0) {}
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // Inclusive lower bound of the range.
    const void* start() const {
        return m_start;
    }

    // Exclusive upper bound of the range.
    const void* end() const {
        return static_cast<const char*>(m_start) + m_length;
    }

    // Length of the range in bytes.
    uint64_t length() const {
        return m_length;
    }

    // returns true if given range overlaps with another one
    bool overlaps(const MemRange &o) const {
        // Two address ranges A[start, end) and B[start, end) overlap
        // if A.start < B.end and A.end > B.start.
        // Note: a zero-length range can never overlap anything.
        return start() < o.end() && end() > o.start();
    }

    // returns true if given range contains the other range
    bool contains(const MemRange &o) const {
        return start() <= o.start() && o.end() <= end();
    }

private:
    const void* m_start;
    uint64_t    m_length;
};
80 // Data associated with a pointer variable
81 class PtrData {
82 public:
83 PtrData(const void *addr, uint64_t len) :
84 cpu_addr(addr, len), cpu_buf(0),
85 mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
86 ref_count(0), is_static(false), is_omp_associate(false)
90 // Copy constructor
92 PtrData(const PtrData& ptr):
93 cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
94 mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
95 mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
96 ref_count(ptr.ref_count), is_static(ptr.is_static),
97 is_omp_associate(ptr.is_omp_associate),
98 var_alloc_type(0)
101 bool operator<(const PtrData &o) const {
102 // Variables are sorted by the CPU start address.
103 // Overlapping memory ranges are considered equal.
104 return (cpu_addr.start() < o.cpu_addr.start()) &&
105 !cpu_addr.overlaps(o.cpu_addr);
108 long add_reference() {
109 if (is_omp_associate || (is_static && !var_alloc_type)) {
110 return LONG_MAX;
112 #ifndef TARGET_WINNT
113 return __sync_fetch_and_add(&ref_count, 1);
114 #else // TARGET_WINNT
115 return _InterlockedIncrement(&ref_count) - 1;
116 #endif // TARGET_WINNT
119 long remove_reference() {
120 if (is_omp_associate || (is_static && !var_alloc_type)) {
121 return LONG_MAX;
123 #ifndef TARGET_WINNT
124 return __sync_sub_and_fetch(&ref_count, 1);
125 #else // TARGET_WINNT
126 return _InterlockedDecrement(&ref_count);
127 #endif // TARGET_WINNT
130 long get_reference() const {
131 if (is_omp_associate || (is_static && !var_alloc_type)) {
132 return LONG_MAX;
134 return ref_count;
137 public:
138 // CPU address range
139 const MemRange cpu_addr;
141 // CPU and MIC buffers
142 COIBUFFER cpu_buf;
143 COIBUFFER mic_buf;
145 // placeholder for buffer address on mic
146 uint64_t mic_addr;
148 uint64_t alloc_disp;
150 // additional offset to pointer data on MIC for improving bandwidth for
151 // data which is not 4K aligned
152 uint32_t mic_offset;
154 // if true buffers are created from static memory
155 bool is_static;
157 // true if MIC buffer created by omp_target_associate
158 bool is_omp_associate;
160 bool var_alloc_type;
161 mutex_t alloc_ptr_data_lock;
163 private:
164 // reference count for the entry
165 long ref_count;
168 typedef std::list<PtrData*> PtrDataList;
170 class PtrDataTable {
171 public:
172 typedef std::set<PtrData> PtrSet;
174 PtrData* find_ptr_data(const void *ptr) {
175 m_ptr_lock.lock();
176 PtrSet::iterator res = list.find(PtrData(ptr, 0));
178 m_ptr_lock.unlock();
179 if (res == list.end()) {
180 return 0;
182 return const_cast<PtrData*>(res.operator->());
185 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
186 m_ptr_lock.lock();
187 std::pair<PtrSet::iterator, bool> res =
188 list.insert(PtrData(ptr, len));
190 PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
191 m_ptr_lock.unlock();
193 is_new = res.second;
194 if (is_new) {
195 // It's necessary to lock as soon as possible.
196 // unlock must be done at call site of insert_ptr_data at
197 // branch for is_new
198 ptr_data->alloc_ptr_data_lock.lock();
200 return ptr_data;
203 void remove_ptr_data(const void *ptr) {
204 m_ptr_lock.lock();
205 list.erase(PtrData(ptr, 0));
206 m_ptr_lock.unlock();
208 private:
210 PtrSet list;
211 mutex_t m_ptr_lock;
214 // Data associated with automatic variable
215 class AutoData {
216 public:
217 AutoData(const void *addr, uint64_t len) :
218 cpu_addr(addr, len), ref_count(0)
221 bool operator<(const AutoData &o) const {
222 // Variables are sorted by the CPU start address.
223 // Overlapping memory ranges are considered equal.
224 return (cpu_addr.start() < o.cpu_addr.start()) &&
225 !cpu_addr.overlaps(o.cpu_addr);
228 long add_reference() {
229 #ifndef TARGET_WINNT
230 return __sync_fetch_and_add(&ref_count, 1);
231 #else // TARGET_WINNT
232 return _InterlockedIncrement(&ref_count) - 1;
233 #endif // TARGET_WINNT
236 long remove_reference() {
237 #ifndef TARGET_WINNT
238 return __sync_sub_and_fetch(&ref_count, 1);
239 #else // TARGET_WINNT
240 return _InterlockedDecrement(&ref_count);
241 #endif // TARGET_WINNT
244 long nullify_reference() {
245 #ifndef TARGET_WINNT
246 return __sync_lock_test_and_set(&ref_count, 0);
247 #else // TARGET_WINNT
248 return _InterlockedExchange(&ref_count,0);
249 #endif // TARGET_WINNT
252 long get_reference() const {
253 return ref_count;
256 public:
257 // CPU address range
258 const MemRange cpu_addr;
260 private:
261 // reference count for the entry
262 long ref_count;
265 // Set of autimatic variables
266 typedef std::set<AutoData> AutoSet;
// Target image data: one offloadable library image to be loaded on the
// device, together with the host file it came from.
struct TargetImage
{
    TargetImage(const char *_name, const void *_data, uint64_t _size,
                const char *_origin, uint64_t _offset) :
        name(_name), data(_data), size(_size),
        origin(_origin), offset(_offset)
    {}

    // library name
    const char* name;

    // contents and size
    const void* data;
    uint64_t    size;

    // file of origin and offset within that file
    const char* origin;
    uint64_t    offset;
};

typedef std::list<TargetImage> TargetImageList;
291 // dynamic library and Image associated with lib
292 struct DynLib
294 DynLib(const char *_name, const void *_data,
295 COILIBRARY _lib) :
296 name(_name), data(_data), lib(_lib)
298 // library name
299 const char* name;
301 // contents
302 const void* data;
304 COILIBRARY lib;
306 typedef std::list<DynLib> DynLibList;
308 // Data associated with persistent auto objects
309 struct PersistData
311 PersistData(const void *addr, uint64_t routine_num,
312 uint64_t size, uint64_t thread) :
313 stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
315 stack_ptr_data = new PtrData(0, size);
317 // 1-st key value - beginning of the stack at CPU
318 const void * stack_cpu_addr;
319 // 2-nd key value - identifier of routine invocation at CPU
320 uint64_t routine_id;
321 // 3-rd key value - thread identifier
322 uint64_t thread_id;
324 // corresponded PtrData; only stack_ptr_data->mic_buf is used
325 PtrData * stack_ptr_data;
326 // used to get offset of the variable in stack buffer
327 char * cpu_stack_addr;
330 typedef std::list<PersistData> PersistDataList;
332 // Data associated with stream
333 struct Stream
335 Stream(int device, int num_of_cpus) :
336 m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
337 m_device(device)
339 ~Stream() {
340 if (m_pipeline) {
341 COI::PipelineDestroy(m_pipeline);
345 COIPIPELINE get_pipeline(void) {
346 return(m_pipeline);
349 int get_device(void) {
350 return(m_device);
353 int get_cpu_number(void) {
354 return(m_number_of_cpus);
357 void set_pipeline(COIPIPELINE pipeline) {
358 m_pipeline = pipeline;
361 OffloadDescriptor* get_last_offload(void) {
362 return(m_last_offload);
365 void set_last_offload(OffloadDescriptor* last_offload) {
366 m_last_offload = last_offload;
369 static Stream* find_stream(uint64_t handle, bool remove);
371 static _Offload_stream add_stream(int device, int number_of_cpus) {
372 _Offload_stream result;
373 m_stream_lock.lock();
374 result = ++m_streams_count;
375 all_streams[m_streams_count] = new Stream(device, number_of_cpus);
376 m_stream_lock.unlock();
377 return(result);
380 static uint64_t get_streams_count() {
381 return m_streams_count;
384 typedef std::map<uint64_t, Stream*> StreamMap;
386 static uint64_t m_streams_count;
387 static StreamMap all_streams;
388 static mutex_t m_stream_lock;
390 int m_device;
392 // number of cpus
393 int m_number_of_cpus;
395 // The pipeline associated with the stream
396 COIPIPELINE m_pipeline;
398 // The last offload occured via the stream
399 OffloadDescriptor* m_last_offload;
401 // Cpus used by the stream
402 std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
405 typedef std::map<uint64_t, Stream*> StreamMap;
406 typedef std::bitset<COI_MAX_HW_THREADS> micLcpuMask;
408 // ordered by count double linked list of cpus used by streams
409 typedef struct CpuEl{
410 uint64_t count; // number of streams using the cpu
411 struct CpuEl* prev; // cpu with the same or lesser count
412 struct CpuEl* next; // cpu with the same or greater count
413 } CpuEl;
415 // class representing a single engine
416 struct Engine {
417 friend void __offload_init_library_once(void);
418 friend void __offload_fini_library(void);
420 #define CPU_INDEX(x) (x - m_cpus)
421 #define check_result(res, tag, ...) \
423 if (res == COI_PROCESS_DIED) { \
424 fini_process(true); \
425 exit(1); \
427 if (res != COI_SUCCESS) { \
428 __liboffload_error_support(tag, __VA_ARGS__); \
429 exit(1); \
433 int get_logical_index() const {
434 return m_index;
437 int get_physical_index() const {
438 return m_physical_index;
441 const COIPROCESS& get_process() const {
442 return m_process;
445 bool get_ready() {
446 return m_ready;
449 uint64_t get_thread_id(void);
451 // initialize device
452 void init(void);
454 // unload library
455 void unload_library(const void *data, const char *name);
457 // add new library
458 void add_lib(const TargetImage &lib)
460 m_lock.lock();
461 m_ready = false;
462 m_images.push_back(lib);
463 m_lock.unlock();
466 COIRESULT compute(
467 _Offload_stream stream,
468 const std::list<COIBUFFER> &buffers,
469 const void* data,
470 uint16_t data_size,
471 void* ret,
472 uint16_t ret_size,
473 uint32_t num_deps,
474 const COIEVENT* deps,
475 COIEVENT* event
478 #ifdef MYO_SUPPORT
479 // temporary workaround for blocking behavior for myoiLibInit/Fini calls
480 void init_myo(COIEVENT *event) {
481 COIRESULT res;
482 res = COI::PipelineRunFunction(get_pipeline(),
483 m_funcs[c_func_myo_init],
484 0, 0, 0, 0, 0, 0, 0, 0, 0,
485 event);
486 check_result(res, c_pipeline_run_func, m_index, res);
489 void fini_myo(COIEVENT *event) {
490 COIRESULT res;
491 res = COI::PipelineRunFunction(get_pipeline(),
492 m_funcs[c_func_myo_fini],
493 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 event);
495 check_result(res, c_pipeline_run_func, m_index, res);
497 #endif // MYO_SUPPORT
500 // Memory association table
502 PtrData* find_ptr_data(const void *ptr) {
503 return m_ptr_set.find_ptr_data(ptr);
506 PtrData* find_targetptr_data(const void *ptr) {
507 return m_targetptr_set.find_ptr_data(ptr);
510 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
511 return m_ptr_set.insert_ptr_data(ptr, len, is_new);
514 PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
515 bool &is_new) {
516 return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
519 void remove_ptr_data(const void *ptr) {
520 m_ptr_set.remove_ptr_data(ptr);
523 void remove_targetptr_data(const void *ptr) {
524 m_targetptr_set.remove_ptr_data(ptr);
528 // Automatic variables
530 AutoData* find_auto_data(const void *ptr) {
531 AutoSet &auto_vars = get_auto_vars();
532 AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
533 if (res == auto_vars.end()) {
534 return 0;
536 return const_cast<AutoData*>(res.operator->());
539 AutoData* insert_auto_data(const void *ptr, uint64_t len) {
540 AutoSet &auto_vars = get_auto_vars();
541 std::pair<AutoSet::iterator, bool> res =
542 auto_vars.insert(AutoData(ptr, len));
543 return const_cast<AutoData*>(res.first.operator->());
546 void remove_auto_data(const void *ptr) {
547 get_auto_vars().erase(AutoData(ptr, 0));
551 // Signals
553 void add_signal(const void *signal, OffloadDescriptor *desc) {
554 m_signal_lock.lock();
555 m_signal_map[signal] = desc;
556 m_signal_lock.unlock();
559 OffloadDescriptor* find_signal(const void *signal, bool remove) {
560 OffloadDescriptor *desc = 0;
562 m_signal_lock.lock();
564 SignalMap::iterator it = m_signal_map.find(signal);
565 if (it != m_signal_map.end()) {
566 desc = it->second;
567 if (remove) {
568 it->second = SIGNAL_HAS_COMPLETED;
572 m_signal_lock.unlock();
574 return desc;
577 void complete_signaled_ofld(const void *signal) {
579 m_signal_lock.lock();
581 SignalMap::iterator it = m_signal_map.find(signal);
582 if (it != m_signal_map.end()) {
583 it->second = SIGNAL_HAS_COMPLETED;
586 m_signal_lock.unlock();
589 void stream_destroy(_Offload_stream handle);
591 void move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after);
592 void print_stream_cpu_list(const char *);
594 COIPIPELINE get_pipeline(_Offload_stream stream);
596 StreamMap get_stream_map() {
597 return m_stream_map;
600 // stop device process
601 void fini_process(bool verbose);
603 // list of stacks active at the engine
604 PersistDataList m_persist_list;
606 private:
607 Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
608 m_proc_number(0), m_assigned_cpus(0), m_cpus(0), m_cpu_head(0)
611 ~Engine() {
612 m_ready = false;
613 for (StreamMap::iterator it = m_stream_map.begin();
614 it != m_stream_map.end(); it++) {
615 Stream * stream = it->second;
616 delete stream;
618 if (m_process != 0) {
619 fini_process(false);
621 if (m_assigned_cpus) {
622 delete m_assigned_cpus;
626 // set indexes
627 void set_indexes(int logical_index, int physical_index) {
628 m_index = logical_index;
629 m_physical_index = physical_index;
632 // set CPU mask
633 void set_cpu_mask(micLcpuMask *cpu_mask)
635 m_assigned_cpus = cpu_mask;
638 // start process on device
639 void init_process();
641 void load_libraries(void);
642 void init_ptr_data(void);
644 // performs library intialization on the device side
645 pid_t init_device(void);
647 private:
648 // get pipeline associated with a calling thread
649 COIPIPELINE get_pipeline(void);
651 // get automatic vars set associated with the calling thread
652 AutoSet& get_auto_vars(void);
654 // destructor for thread data
655 static void destroy_thread_data(void *data);
657 private:
658 typedef std::set<PtrData> PtrSet;
659 typedef std::map<const void*, OffloadDescriptor*> SignalMap;
661 // device indexes
662 int m_index;
663 int m_physical_index;
665 // cpu mask
666 micLcpuMask *m_assigned_cpus;
668 // number of COI pipes created for the engine
669 long m_proc_number;
671 // process handle
672 COIPROCESS m_process;
674 // If false, device either has not been initialized or new libraries
675 // have been added.
676 bool m_ready;
677 mutex_t m_lock;
679 // List of libraries to be loaded
680 TargetImageList m_images;
682 // var tables
683 PtrDataTable m_ptr_set;
684 PtrDataTable m_targetptr_set;
686 // signals
687 SignalMap m_signal_map;
688 mutex_t m_signal_lock;
690 // streams
691 StreamMap m_stream_map;
692 mutex_t m_stream_lock;
693 int m_num_cores;
694 int m_num_threads;
695 CpuEl* m_cpus;
696 CpuEl* m_cpu_head;
698 // List of dynamic libraries to be registred
699 DynLibList m_dyn_libs;
701 // constants for accessing device function handles
702 enum {
703 c_func_compute = 0,
704 #ifdef MYO_SUPPORT
705 c_func_myo_init,
706 c_func_myo_fini,
707 #endif // MYO_SUPPORT
708 c_func_init,
709 c_func_var_table_size,
710 c_func_var_table_copy,
711 c_func_set_stream_affinity,
712 c_funcs_total
714 static const char* m_func_names[c_funcs_total];
716 // device function handles
717 COIFUNCTION m_funcs[c_funcs_total];
719 // int -> name mapping for device signals
720 static const int c_signal_max = 32;
721 static const char* c_signal_names[c_signal_max];
#endif // OFFLOAD_ENGINE_H_INCLUDED