/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
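
/* Each ptx_stream carries one host/device-visible page used as a ring
   buffer of kernel-argument blocks.  A block is a struct map header (the
   async value and the block's total size) followed by the MAPPINGS
   payload.  H_NEXT, H_PREV and H_TAIL in ptx_stream track the write,
   last-write and read positions within [H_BEGIN, H_END): map_push
   allocates a block at the head and map_pop retires the oldest block,
   wrapping around the page as needed.  */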

static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
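
/* A minimal sketch of how the launch path pairs map_push with map_pop
   (compare nvptx_exec below).  This helper is hypothetical and is not
   called by the plugin; it assumes S is a valid, initialized stream.  */

static void __attribute__ ((unused))
map_example_roundtrip (struct ptx_stream *s, void **args, size_t nargs)
{
  void *hp, *dp;

  /* Reserve a block large enough for NARGS pointers (the struct map
     header is added by map_push itself); async value 0 for brevity.  */
  map_push (s, 0, nargs * sizeof (void *), &hp, &dp);

  /* Stage the pointers through the host view of the mapped page.  */
  memcpy (hp, args, nargs * sizeof (void *));

  /* ... a kernel would read the same block through DP here ...  */

  /* Retire the oldest block once its consumer has finished.  */
  map_pop (s);
}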

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;
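
  /* After this shift, acc_async_noval (-1) indexes slot 0 of the
     async_streams array and a user-visible async value N indexes slot
     N + 1; only acc_async_sync (-2) stays negative and is handled
     specially below.  */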

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
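
/* Callers pass CREATE as false when they only need to look up an existing
   stream (e.g. nvptx_exec, nvptx_async_test below) and as true when they
   are about to queue work on it or bind a stream to an async value
   (e.g. nvptx_set_async, nvptx_wait_async, nvptx_set_cuda_stream).  */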

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}

static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x
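
  /* Concretely: gangs become the X dimension of the CUDA grid, the vector
     length becomes the X dimension of each thread block, and workers become
     each block's Y dimension, matching the cuLaunchKernel call below.  */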

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
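
/* A user-side sketch of the interop path that ends up in
   nvptx_set_cuda_stream above: the application creates its own CUDA stream
   and binds it to an OpenACC async queue via the openacc.h API.  This
   helper is hypothetical and not part of the plugin; it assumes a current
   CUDA context, and queue number 5 is chosen arbitrarily.  */

static void __attribute__ ((unused))
example_bind_user_stream (void)
{
  CUstream stream;

  if (cuStreamCreate (&stream, CU_STREAM_DEFAULT) != CUDA_SUCCESS)
    return;

  /* Work queued with 'async (5)' now runs on STREAM; the plugin takes
     ownership and destroys any stream previously bound to queue 5.  */
  acc_set_cuda_stream (5, stream);
}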

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
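
/* The returned table therefore holds FN_ENTRIES function entries first,
   each with START pointing at a host-side targ_fn_descriptor (and a dummy
   one-byte extent) rather than device memory, followed by VAR_ENTRIES
   variable entries giving real device address ranges.  */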

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}