2015-11-15 Paul Thomas <pault@gcc.gnu.org>
[official-gcc.git] / liboffloadmic / runtime / offload_engine.h
blobabd5cc82f3058e1f7526a2854f6a5c4dd28b521f
/*
    Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
31 #ifndef OFFLOAD_ENGINE_H_INCLUDED
32 #define OFFLOAD_ENGINE_H_INCLUDED
34 #include <limits.h>
35 #include <bitset>
36 #include <list>
37 #include <set>
38 #include <map>
39 #include "offload_common.h"
40 #include "coi/coi_client.h"
// Sentinel stored in the signal map in place of a descriptor once a signal
// has been looked up with removal requested.
#define SIGNAL_IS_REMOVED ((OffloadDescriptor *)-1)

// Stream handle value -1.
// NOTE(review): presumably means "no stream specified" - confirm at call sites.
const int64_t no_stream = -1;
// Address range
//
// Describes the half-open byte range [start, start + length).
class MemRange {
public:
    // Empty range starting at address 0.
    MemRange() : m_start(0), m_length(0) {}

    // Range of len bytes beginning at addr.
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // First address of the range.
    const void* start() const {
        return m_start;
    }

    // One-past-the-last address of the range (start + length bytes).
    const void* end() const {
        return static_cast<const char*>(m_start) + m_length;
    }

    // Size of the range in bytes.
    uint64_t length() const {
        return m_length;
    }

    // returns true if given range overlaps with another one
    bool overlaps(const MemRange &o) const {
        // Two address ranges A[start, end) and B[start,end) overlap
        // if A.start < B.end and A.end > B.start.
        return start() < o.end() && end() > o.start();
    }

    // returns true if given range contains the other range
    bool contains(const MemRange &o) const {
        return start() <= o.start() && o.end() <= end();
    }

private:
    const void* m_start;
    uint64_t    m_length;
};
80 // Data associated with a pointer variable
81 class PtrData {
82 public:
83 PtrData(const void *addr, uint64_t len) :
84 cpu_addr(addr, len), cpu_buf(0),
85 mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
86 ref_count(0), is_static(false)
90 // Copy constructor
92 PtrData(const PtrData& ptr):
93 cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
94 mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
95 mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
96 ref_count(ptr.ref_count), is_static(ptr.is_static)
99 bool operator<(const PtrData &o) const {
100 // Variables are sorted by the CPU start address.
101 // Overlapping memory ranges are considered equal.
102 return (cpu_addr.start() < o.cpu_addr.start()) &&
103 !cpu_addr.overlaps(o.cpu_addr);
106 long add_reference() {
107 if (is_static) {
108 return LONG_MAX;
110 #ifndef TARGET_WINNT
111 return __sync_fetch_and_add(&ref_count, 1);
112 #else // TARGET_WINNT
113 return _InterlockedIncrement(&ref_count) - 1;
114 #endif // TARGET_WINNT
117 long remove_reference() {
118 if (is_static) {
119 return LONG_MAX;
121 #ifndef TARGET_WINNT
122 return __sync_sub_and_fetch(&ref_count, 1);
123 #else // TARGET_WINNT
124 return _InterlockedDecrement(&ref_count);
125 #endif // TARGET_WINNT
128 long get_reference() const {
129 if (is_static) {
130 return LONG_MAX;
132 return ref_count;
135 public:
136 // CPU address range
137 const MemRange cpu_addr;
139 // CPU and MIC buffers
140 COIBUFFER cpu_buf;
141 COIBUFFER mic_buf;
143 // placeholder for buffer address on mic
144 uint64_t mic_addr;
146 uint64_t alloc_disp;
148 // additional offset to pointer data on MIC for improving bandwidth for
149 // data which is not 4K aligned
150 uint32_t mic_offset;
152 // if true buffers are created from static memory
153 bool is_static;
154 mutex_t alloc_ptr_data_lock;
156 private:
157 // reference count for the entry
158 long ref_count;
161 typedef std::list<PtrData*> PtrDataList;
163 class PtrDataTable {
164 public:
165 typedef std::set<PtrData> PtrSet;
167 PtrData* find_ptr_data(const void *ptr) {
168 m_ptr_lock.lock();
169 PtrSet::iterator res = list.find(PtrData(ptr, 0));
171 m_ptr_lock.unlock();
172 if (res == list.end()) {
173 return 0;
175 return const_cast<PtrData*>(res.operator->());
178 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
179 m_ptr_lock.lock();
180 std::pair<PtrSet::iterator, bool> res =
181 list.insert(PtrData(ptr, len));
183 PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
184 m_ptr_lock.unlock();
186 is_new = res.second;
187 if (is_new) {
188 // It's necessary to lock as soon as possible.
189 // unlock must be done at call site of insert_ptr_data at
190 // branch for is_new
191 ptr_data->alloc_ptr_data_lock.lock();
193 return ptr_data;
196 void remove_ptr_data(const void *ptr) {
197 m_ptr_lock.lock();
198 list.erase(PtrData(ptr, 0));
199 m_ptr_lock.unlock();
201 private:
203 PtrSet list;
204 mutex_t m_ptr_lock;
207 // Data associated with automatic variable
208 class AutoData {
209 public:
210 AutoData(const void *addr, uint64_t len) :
211 cpu_addr(addr, len), ref_count(0)
214 bool operator<(const AutoData &o) const {
215 // Variables are sorted by the CPU start address.
216 // Overlapping memory ranges are considered equal.
217 return (cpu_addr.start() < o.cpu_addr.start()) &&
218 !cpu_addr.overlaps(o.cpu_addr);
221 long add_reference() {
222 #ifndef TARGET_WINNT
223 return __sync_fetch_and_add(&ref_count, 1);
224 #else // TARGET_WINNT
225 return _InterlockedIncrement(&ref_count) - 1;
226 #endif // TARGET_WINNT
229 long remove_reference() {
230 #ifndef TARGET_WINNT
231 return __sync_sub_and_fetch(&ref_count, 1);
232 #else // TARGET_WINNT
233 return _InterlockedDecrement(&ref_count);
234 #endif // TARGET_WINNT
237 long nullify_reference() {
238 #ifndef TARGET_WINNT
239 return __sync_lock_test_and_set(&ref_count, 0);
240 #else // TARGET_WINNT
241 return _InterlockedExchange(&ref_count,0);
242 #endif // TARGET_WINNT
245 long get_reference() const {
246 return ref_count;
249 public:
250 // CPU address range
251 const MemRange cpu_addr;
253 private:
254 // reference count for the entry
255 long ref_count;
258 // Set of autimatic variables
259 typedef std::set<AutoData> AutoSet;
// Target image data
//
// Plain record describing one image. All pointers are stored as given;
// nothing is copied or freed by this struct.
struct TargetImage
{
    TargetImage(const char *_name, const void *_data, uint64_t _size,
                const char *_origin, uint64_t _offset) :
        name(_name), data(_data), size(_size),
        origin(_origin), offset(_offset)
    {}

    // library name
    const char* name;

    // contents and size
    const void* data;
    uint64_t    size;

    // file of origin and offset within that file
    const char* origin;
    uint64_t    offset;
};

typedef std::list<TargetImage> TargetImageList;
284 // dynamic library and Image associated with lib
285 struct DynLib
287 DynLib(const char *_name, const void *_data,
288 COILIBRARY _lib) :
289 name(_name), data(_data), lib(_lib)
291 // library name
292 const char* name;
294 // contents
295 const void* data;
297 COILIBRARY lib;
299 typedef std::list<DynLib> DynLibList;
301 // Data associated with persistent auto objects
302 struct PersistData
304 PersistData(const void *addr, uint64_t routine_num,
305 uint64_t size, uint64_t thread) :
306 stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
308 stack_ptr_data = new PtrData(0, size);
310 // 1-st key value - beginning of the stack at CPU
311 const void * stack_cpu_addr;
312 // 2-nd key value - identifier of routine invocation at CPU
313 uint64_t routine_id;
314 // 3-rd key value - thread identifier
315 uint64_t thread_id;
317 // corresponded PtrData; only stack_ptr_data->mic_buf is used
318 PtrData * stack_ptr_data;
319 // used to get offset of the variable in stack buffer
320 char * cpu_stack_addr;
323 typedef std::list<PersistData> PersistDataList;
325 // Data associated with stream
326 struct Stream
328 Stream(int device, int num_of_cpus) :
329 m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
330 m_device(device)
332 ~Stream() {
333 if (m_pipeline) {
334 COI::PipelineDestroy(m_pipeline);
338 COIPIPELINE get_pipeline(void) {
339 return(m_pipeline);
342 int get_device(void) {
343 return(m_device);
346 int get_cpu_number(void) {
347 return(m_number_of_cpus);
350 void set_pipeline(COIPIPELINE pipeline) {
351 m_pipeline = pipeline;
354 OffloadDescriptor* get_last_offload(void) {
355 return(m_last_offload);
358 void set_last_offload(OffloadDescriptor* last_offload) {
359 m_last_offload = last_offload;
362 static Stream* find_stream(uint64_t handle, bool remove);
364 static _Offload_stream add_stream(int device, int number_of_cpus) {
365 m_stream_lock.lock();
366 all_streams[++m_streams_count] = new Stream(device, number_of_cpus);
367 m_stream_lock.unlock();
368 return(m_streams_count);
371 typedef std::map<uint64_t, Stream*> StreamMap;
373 static uint64_t m_streams_count;
374 static StreamMap all_streams;
375 static mutex_t m_stream_lock;
377 int m_device;
379 // number of cpus
380 int m_number_of_cpus;
382 // The pipeline associated with the stream
383 COIPIPELINE m_pipeline;
385 // The last offload occured via the stream
386 OffloadDescriptor* m_last_offload;
388 // Cpus used by the stream
389 std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
392 typedef std::map<uint64_t, Stream*> StreamMap;
394 // class representing a single engine
395 struct Engine {
396 friend void __offload_init_library_once(void);
397 friend void __offload_fini_library(void);
399 #define check_result(res, tag, ...) \
401 if (res == COI_PROCESS_DIED) { \
402 fini_process(true); \
403 exit(1); \
405 if (res != COI_SUCCESS) { \
406 __liboffload_error_support(tag, __VA_ARGS__); \
407 exit(1); \
411 int get_logical_index() const {
412 return m_index;
415 int get_physical_index() const {
416 return m_physical_index;
419 const COIPROCESS& get_process() const {
420 return m_process;
423 uint64_t get_thread_id(void);
425 // initialize device
426 void init(void);
428 // unload library
429 void unload_library(const void *data, const char *name);
431 // add new library
432 void add_lib(const TargetImage &lib)
434 m_lock.lock();
435 m_ready = false;
436 m_images.push_back(lib);
437 m_lock.unlock();
440 COIRESULT compute(
441 _Offload_stream stream,
442 const std::list<COIBUFFER> &buffers,
443 const void* data,
444 uint16_t data_size,
445 void* ret,
446 uint16_t ret_size,
447 uint32_t num_deps,
448 const COIEVENT* deps,
449 COIEVENT* event
452 #ifdef MYO_SUPPORT
453 // temporary workaround for blocking behavior for myoiLibInit/Fini calls
454 void init_myo(COIEVENT *event) {
455 COIRESULT res;
456 res = COI::PipelineRunFunction(get_pipeline(),
457 m_funcs[c_func_myo_init],
458 0, 0, 0, 0, 0, 0, 0, 0, 0,
459 event);
460 check_result(res, c_pipeline_run_func, m_index, res);
463 void fini_myo(COIEVENT *event) {
464 COIRESULT res;
465 res = COI::PipelineRunFunction(get_pipeline(),
466 m_funcs[c_func_myo_fini],
467 0, 0, 0, 0, 0, 0, 0, 0, 0,
468 event);
469 check_result(res, c_pipeline_run_func, m_index, res);
471 #endif // MYO_SUPPORT
474 // Memory association table
476 PtrData* find_ptr_data(const void *ptr) {
477 return m_ptr_set.find_ptr_data(ptr);
480 PtrData* find_targetptr_data(const void *ptr) {
481 return m_targetptr_set.find_ptr_data(ptr);
484 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
485 return m_ptr_set.insert_ptr_data(ptr, len, is_new);
488 PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
489 bool &is_new) {
490 return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
493 void remove_ptr_data(const void *ptr) {
494 m_ptr_set.remove_ptr_data(ptr);
497 void remove_targetptr_data(const void *ptr) {
498 m_targetptr_set.remove_ptr_data(ptr);
502 // Automatic variables
504 AutoData* find_auto_data(const void *ptr) {
505 AutoSet &auto_vars = get_auto_vars();
506 AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
507 if (res == auto_vars.end()) {
508 return 0;
510 return const_cast<AutoData*>(res.operator->());
513 AutoData* insert_auto_data(const void *ptr, uint64_t len) {
514 AutoSet &auto_vars = get_auto_vars();
515 std::pair<AutoSet::iterator, bool> res =
516 auto_vars.insert(AutoData(ptr, len));
517 return const_cast<AutoData*>(res.first.operator->());
520 void remove_auto_data(const void *ptr) {
521 get_auto_vars().erase(AutoData(ptr, 0));
525 // Signals
527 void add_signal(const void *signal, OffloadDescriptor *desc) {
528 m_signal_lock.lock();
529 m_signal_map[signal] = desc;
530 m_signal_lock.unlock();
533 OffloadDescriptor* find_signal(const void *signal, bool remove) {
534 OffloadDescriptor *desc = 0;
536 m_signal_lock.lock();
538 SignalMap::iterator it = m_signal_map.find(signal);
539 if (it != m_signal_map.end()) {
540 desc = it->second;
541 if (remove) {
542 it->second = SIGNAL_IS_REMOVED;
546 m_signal_lock.unlock();
548 return desc;
551 void stream_destroy(_Offload_stream handle);
553 COIPIPELINE get_pipeline(_Offload_stream stream);
555 StreamMap get_stream_map() {
556 return m_stream_map;
559 // stop device process
560 void fini_process(bool verbose);
562 // list of stacks active at the engine
563 PersistDataList m_persist_list;
565 private:
566 Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
567 m_proc_number(0)
570 ~Engine() {
571 for (StreamMap::iterator it = m_stream_map.begin();
572 it != m_stream_map.end(); it++) {
573 Stream * stream = it->second;
574 delete stream;
576 if (m_process != 0) {
577 fini_process(false);
581 // set indexes
582 void set_indexes(int logical_index, int physical_index) {
583 m_index = logical_index;
584 m_physical_index = physical_index;
587 // start process on device
588 void init_process();
590 void load_libraries(void);
591 void init_ptr_data(void);
593 // performs library intialization on the device side
594 pid_t init_device(void);
596 private:
597 // get pipeline associated with a calling thread
598 COIPIPELINE get_pipeline(void);
600 // get automatic vars set associated with the calling thread
601 AutoSet& get_auto_vars(void);
603 // destructor for thread data
604 static void destroy_thread_data(void *data);
606 private:
607 typedef std::set<PtrData> PtrSet;
608 typedef std::map<const void*, OffloadDescriptor*> SignalMap;
610 // device indexes
611 int m_index;
612 int m_physical_index;
614 // number of COI pipes created for the engine
615 long m_proc_number;
617 // process handle
618 COIPROCESS m_process;
620 // If false, device either has not been initialized or new libraries
621 // have been added.
622 bool m_ready;
623 mutex_t m_lock;
625 // List of libraries to be loaded
626 TargetImageList m_images;
628 // var tables
629 PtrDataTable m_ptr_set;
630 PtrDataTable m_targetptr_set;
632 // signals
633 SignalMap m_signal_map;
634 mutex_t m_signal_lock;
636 // streams
637 StreamMap m_stream_map;
638 mutex_t m_stream_lock;
639 int m_num_cores;
640 int m_num_threads;
641 std::bitset<COI_MAX_HW_THREADS> m_cpus;
643 // List of dynamic libraries to be registred
644 DynLibList m_dyn_libs;
646 // constants for accessing device function handles
647 enum {
648 c_func_compute = 0,
649 #ifdef MYO_SUPPORT
650 c_func_myo_init,
651 c_func_myo_fini,
652 #endif // MYO_SUPPORT
653 c_func_init,
654 c_func_var_table_size,
655 c_func_var_table_copy,
656 c_func_set_stream_affinity,
657 c_funcs_total
659 static const char* m_func_names[c_funcs_total];
661 // device function handles
662 COIFUNCTION m_funcs[c_funcs_total];
664 // int -> name mapping for device signals
665 static const int c_signal_max = 32;
666 static const char* c_signal_names[c_signal_max];
669 #endif // OFFLOAD_ENGINE_H_INCLUDED