/* Plugin for NVPTX execution.

   Copyright (C) 2013-2016 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

/* Return a description string for the CUDA error code R, falling back to a
   generic message if the lookup itself fails.  */

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence.  This does not capture all the cases we
   use in this file, but is common enough.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r = FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, (FN), __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r = FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

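/* For illustration only, a hypothetical bool-returning helper using the
   macros above: CUDA_CALL reports the failure and returns false from the
   enclosing function, CUDA_CALL_ERET does the same with an arbitrary error
   return value, and CUDA_CALL_ASSERT aborts via GOMP_PLUGIN_fatal instead:

     static bool
     example_alloc (CUdeviceptr *d, size_t size)
     {
       CUDA_CALL (cuMemAlloc, d, size);	// on failure: return false
       return true;
     }
*/
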
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  /* Page-locked host buffer (H) and its device alias (D), used as a ring
     buffer for staging kernel arguments; see map_init and map_push.  */
  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

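/* Header for one block of kernel-argument mappings staged in a stream's
   page-locked buffer (see map_init/map_push/map_pop below): ASYNC is the
   async value the block was pushed for, SIZE the block size including this
   header, and MAPPINGS the start of the payload.  */
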
struct map
{
  int async;
  size_t size;
  char mappings[0];
};

/* Allocate the page-locked host buffer for stream S, obtain its device
   alias, and initialize the ring-buffer pointers.  */

static bool
map_init (struct ptx_stream *s)
{
  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  CUDA_CALL (cuMemAllocHost, &s->h, size);
  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
  return true;
}

static bool
map_fini (struct ptx_stream *s)
{
  CUDA_CALL (cuMemFreeHost, s->h);
  return true;
}

/* Retire the oldest block in stream S's ring buffer, wrapping the tail
   pointer back to the start of the buffer if it runs off the end.  */

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

/* Reserve SIZE bytes (plus a header) in stream S's ring buffer for async
   operation ASYNC, wrapping to the start of the buffer if the block does
   not fit at the end.  Return host and device pointers to the payload in
   *H and *D.  */

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

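/* An illustrative sketch (hypothetical variables) of how the two functions
   above are used by nvptx_exec below: reserve space for the kernel argument
   pointers, fill it via the host alias, and retire the block once the
   kernel has finished:

     void *hp, *dp;
     map_push (stream, async, mapnum * sizeof (void *), &hp, &dp);
     ...			// store argument pointers at HP, launch with DP
     map_pop (stream);		// directly, or deferred via event_gc
*/
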
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  /* Device attributes cached from cuDeviceGetAttribute at open time:
     GPU_OVERLAP, CAN_MAP_HOST_MEMORY, CONCURRENT_KERNELS, COMPUTE_MODE
     and INTEGRATED, respectively.  */
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;
  int val;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

static bool
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  if (!map_init (null_stream))
    return false;

  ptx_dev->null_stream = null_stream;
  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;

  return true;
}

static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  bool ret = true;
  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      ret &= map_fini (s);

      CUresult r = cuStreamDestroy (s->stream);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
	  ret = false;
	}
      free (s);
    }

  ret &= map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
  return ret;
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		{
		  pthread_mutex_unlock (&ptx_dev->stream_lock);
		  GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
				     cuda_error (r));
		}
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  if (!map_init (s))
	    {
	      pthread_mutex_unlock (&ptx_dev->stream_lock);
	      GOMP_PLUGIN_fatal ("map_init fail");
	    }

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    {
      if (create)
	pthread_mutex_unlock (&ptx_dev->stream_lock);
      GOMP_PLUGIN_fatal ("bad async %d", async);
    }

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

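/* Concretely, assuming the standard OpenACC constants acc_async_sync == -2
   and acc_async_noval == -1: the 'async++' above shifts acc_async_noval to
   array slot 0 and user async values 0, 1, 2, ... to slots 1, 2, 3, ...,
   while acc_async_sync itself selects the null stream.  */
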
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  CUDA_CALL (cuInit, 0);
  ptx_events = NULL;
  pthread_mutex_init (&ptx_event_lock, NULL);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);
  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  if (!init_streams_for_device (ptx_dev, async_engines))
    return NULL;

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  if (!fini_streams_for_device (ptx_dev))
    return false;

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      CUresult r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS into a single CUDA module,
   returned in *MODULE.  Return true on success.  */

static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

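/* A minimal, hypothetical use of link_ptx, for illustration only: wrap a
   single NUL-terminated PTX string PTX_TEXT (size here assumed to include
   the terminator) and link it into a loadable module:

     struct targ_ptx_obj obj = { ptx_text, strlen (ptx_text) + 1 };
     CUmodule module;
     if (!link_ptx (&module, &obj, 1))
       GOMP_PLUGIN_fatal ("PTX linking failed");
*/
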
/* Sweep the global event list: destroy events that have completed, free
   their records, and perform any deferred unmapping.  MEMMAP_LOCKABLE says
   whether the caller permits taking the memory-map lock here.  */

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct ptx_event *async_cleanups = NULL;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  bool append_async = false;
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		append_async = true;
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  /* Unlink 'e' from ptx_events list.  */
	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  if (append_async)
	    {
	      e->next = async_cleanups;
	      async_cleanups = e;
	    }
	  else
	    free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);

  /* We have to do these here, after ptx_event_lock is released.  */
  while (async_cleanups)
    {
      struct ptx_event *e = async_cleanups;
      async_cleanups = async_cleanups->next;

      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
      free (e);
    }
}

/* Record a new pending event of TYPE for the current device.  E is the CUDA
   event (already recorded on a stream by the caller); H is an associated
   address and VAL an associated value, both interpreted according to
   TYPE.  */

static void
event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;
  ptx_event->val = val;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

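/* event_add pairs with event_gc to form a simple deferred-cleanup scheme.
   The typical pattern elsewhere in this file is (sketch; E is a freshly
   malloc'd CUevent *, STREAM the stream the work was queued on):

     CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
     event_gc (false);			// reap already-completed events
     ... queue asynchronous work on stream ...
     CUDA_CALL (cuEventRecord, *e, stream);
     event_add (PTX_EVT_MEM, e, (void *) h, 0);

   A later event_gc call destroys the CUDA event and frees the record once
   cuEventQuery reports completion.  */
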
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      for (i = 0; i != GOMP_DIM_MAX; i++)
	if (!dims[i])
	  dims[i] = /* TODO */ 32;
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
		    mapnum * sizeof (void *));
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, dev_str->stream, kargs, 0);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

      event_add (PTX_EVT_KNL, e, (void *) dev_str, 0);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

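/* For example (illustrative numbers only), an OpenACC region compiled with
   num_gangs(32), num_workers(4), vector_length(128) reaches cuLaunchKernel
   above as gridDim = (32,1,1) and blockDim = (128,4,1): gangs map to CTAs
   on the x grid axis, vector lanes to the x thread axis, and workers to
   the y thread axis.  */
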
void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  return (void *) d;
}

static bool
nvptx_free (void *p)
{
  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  return true;
}

static bool
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyHtoDAsync,
		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *) h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);

  return true;
}

static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);

  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
      CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      event_gc (false);
      CUDA_CALL (cuMemcpyDtoHAsync,
		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
      CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
      event_add (PTX_EVT_MEM, e, (void *) h, 0);
    }
  else
#endif
    CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);

  return true;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  event_gc (true);

  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);

  event_add (PTX_EVT_SYNC, e, NULL, 0);

  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

      /* Record an event on the waited-for stream.  */
      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);

      event_add (PTX_EVT_SYNC, e, NULL, 0);

      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      if (!map_fini (oldstream))
	GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}

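/* The table returned above therefore contains FN_ENTRIES function records
   followed by VAR_ENTRIES variable records (layout for illustration):

     index 0 .. fn_entries-1:		start = targ_fn_descriptor address,
					end = start + 1
     index fn_entries .. fn_entries+var_entries-1:
					start = variable device address,
					end = start + size in bytes
*/
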
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (cuModuleUnload (image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}