/* Plugin for NVPTX execution.

   Copyright (C) 2013-2016 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
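
/* A CUDA stream together with the host-side bookkeeping this plugin
   keeps for it: the host thread that created it, whether multiple
   threads share it, and one page of page-locked host memory (mapped at
   D on the device and H on the host) that map_push/map_pop below carve
   up into per-launch argument chunks via the h_* cursors.  */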
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
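
/* Header of each chunk reserved from a stream's mapped page: the async
   value the chunk was queued under and the total size of the chunk
   (header plus payload).  The payload -- e.g. the argument block for a
   kernel launch -- follows in MAPPINGS.  */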
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
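
/* Allocate one page of page-locked host memory for stream S, map it
   into the device's address space, and initialize the ring-buffer
   cursors (h_begin/h_end/h_next/h_prev/h_tail) that track chunks
   reserved from it.  */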
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
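
/* Reserve a chunk of SIZE bytes (plus the struct map header) from
   stream S's mapped page for work queued under ASYNC, wrapping around
   to the start of the page if the space left at the end is too small.
   Return the host and device addresses of the payload in *H and *D.  */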
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *) &m->mappings[0] - s->h;

  *d = (void *) (s->d + offset);
  *h = (void *) (s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
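
/* Typical lifetime of a chunk, as used by nvptx_exec below (a sketch,
   not an exhaustive list of callers):

       map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
       ...fill in *hp, cuLaunchKernel on dev_str->stream...
       map_pop (dev_str);

   where the pop happens directly for synchronous launches, or from
   event_gc once the launch's PTX_EVT_KNL event has completed.  */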

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
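
/* Set up the NULL stream and the async-stream table for PTX_DEV.
   CONCURRENCY is only an initial sizing hint for the table (the caller
   passes the device's async engine count); the streams themselves are
   created lazily by select_stream_for_async.  */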
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
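
/* Tear down what init_streams_for_device set up: destroy all active
   streams (and their mapped pages) and the NULL stream's mapped page.  */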
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Return the stream
   to use.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
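
/* Open PTX device N: create (or share) a CUDA context for it, cache the
   device attributes recorded in struct ptx_device, and set up its
   streams.  Return the new device descriptor.  */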
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
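
/* JIT-compile and link the NUM_OBJS PTX objects in PTX_OBJS into a
   single image using the CUDA cuLink API, then load the result as
   *MODULE.  The JIT's info and error logs are captured into local
   buffers and reported on failure.  */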
static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  r = cuLinkCreate (6, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}
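
/* Walk the global list of outstanding CUDA events for the current
   device and reclaim those that have completed: pop the corresponding
   argument map for kernel events, or unmap variables for async-cleanup
   events.  MEMMAP_LOCKABLE says whether we may call
   GOMP_PLUGIN_async_unmap_vars, which needs the memory-map lock; if
   not, cleanup events are left for a later pass.  */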
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *) te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
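
/* Record that CUDA event E of TYPE, with associated address H, is
   outstanding on the current device, by prepending it to the global
   ptx_events list.  event_gc above reclaims it once it completes.  */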
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
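
/* Launch target function FN on the stream selected for ASYNC, passing
   the MAPNUM device addresses in DEVADDRS as the kernel's single
   argument block.  The OpenACC launch geometry in DIMS maps onto CUDA
   as shown in the table before the cuLaunchKernel call below:
   num_gangs -> grid x, num_workers -> block y, vector length ->
   block x.  */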
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      for (i = 0; i != GOMP_DIM_MAX; i++)
	if (!dims[i])
	  dims[i] = /* TODO */ 32;
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
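
/* Return 1 if all work queued under ASYNC has completed, 0 if some is
   still pending.  A completed stream is also an opportunity to run
   event_gc, since the wait hook (which normally triggers it) is skipped
   in that case; see the comment in the body.  */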
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
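
/* Make the stream for ASYNC2 wait, on the device and without blocking
   the host, until all work currently queued under ASYNC1 has completed:
   record an event on ASYNC1's stream and insert a cuStreamWaitEvent on
   ASYNC2's stream.  */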
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
		       " (expected %u, received %u)",
		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}