/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
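
/* A CUDA stream together with a page-locked staging buffer that is mapped
   into both host and device address spaces.  The h_* pointers implement a
   circular allocator over that buffer (see map_push/map_pop below):
   H_NEXT is where the next chunk will be carved out, H_PREV is the most
   recently pushed chunk, and H_TAIL is the oldest chunk still live.  */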
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
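
/* Allocate a page of page-locked host memory for stream S, obtain the
   corresponding device pointer, and initialize the circular-allocator
   pointers to cover the whole page.  */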
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
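
/* Retire the oldest mapping chunk on stream S, advancing H_TAIL past it
   and wrapping back to the start of the buffer if the end is reached.  */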
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
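
/* Carve a chunk of SIZE bytes (plus a struct map header) out of stream S's
   mapped buffer, wrapping to the start of the buffer when there is not
   enough room at the end.  On return, *H and *D are the host and device
   addresses of the chunk's payload.  */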
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[3];
};

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
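
/* Set up the null stream for PTX_DEV and pre-size the array that maps
   OpenACC async values to streams.  CONCURRENCY is used as the initial
   array size; no async streams are actually created until needed.  */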
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
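
/* Tear down all streams associated with PTX_DEV: destroy every active
   async stream and release the null stream's mapped buffer.  */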
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream
   to use.  */
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
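
/* Open PTX device number N: create a CUDA context for it (or share the
   caller's existing one), cache the device attributes we care about in a
   new ptx_device structure, and set up its streams.  Returns the new
   ptx_device.  */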
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
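
/* Close PTX_DEV: release its streams, destroy its CUDA context (unless the
   context was shared with the caller), and free the device structure.  */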
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
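
/* Return the number of CUDA devices available, or 0 if offloading is not
   supported in this configuration or CUDA cannot be initialized.  */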
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
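
/* JIT-link the given PTX_CODE together with the plugin's built-in PTX
   fragments (abort, acc_on_device and the GOACC internal routines) into a
   loaded CUDA module in *MODULE, targeting compute capability 3.0.  */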
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
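
/* Walk the global list of outstanding events and reclaim any that have
   completed on the current device: pop the associated mapping chunk for
   kernel events, unmap variables for async-cleanup events (only when
   MEMMAP_LOCKABLE permits taking the memory-map lock), then destroy the
   CUDA event and unlink it from the list.  */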
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
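
/* Record a new outstanding event of TYPE for CUDA event E, associated with
   address H, on the current device's list of pending events.  */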
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
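
/* Launch the kernel described by FN on the stream selected by ASYNC,
   passing it the array of device addresses DEVADDRS via a chunk of the
   stream's mapped buffer.  For a synchronous launch, wait for completion
   here; otherwise record an event so the mapping chunk can be reclaimed
   later by event_gc.  */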
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int async, unsigned *dims,
            void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  if (dims[GOMP_DIM_GANG] != 1)
    GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported",
                       dims[GOMP_DIM_GANG]);
  if (dims[GOMP_DIM_WORKER] != 1)
    GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported",
                       dims[GOMP_DIM_WORKER]);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC          CUDA
  //
  // num_gangs        nctaid.x
  // num_workers      ntid.y
  // vector length    ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
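
/* Free device memory P previously obtained from nvptx_alloc, checking via
   cuMemGetAddressRange that P is the base of the allocation.  */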
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
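
/* Copy S bytes from host address H to device address D, after validating
   both addresses.  On a non-null stream the copy is issued asynchronously
   and tracked with a PTX_EVT_MEM event; on the null stream it blocks.  */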
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
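
/* As nvptx_host2dev, but copying S bytes from device address D back to
   host address H.  */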
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
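
/* Bind the current thread's active stream to the one selected by ASYNC,
   creating that stream if it does not exist yet.  */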
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
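
/* Return 1 if all work queued on the stream for ASYNC has completed, else
   0.  Completed work is garbage-collected as a side effect.  */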
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
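
/* Make the stream for ASYNC2 wait, on the device rather than by blocking
   the host, for all work currently queued on the stream for ASYNC1: record
   an event on ASYNC1's stream and insert a wait for it on ASYNC2's.  */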
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
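
/* Make the stream for ASYNC wait on the device for every other stream this
   thread has queued work on, again via recorded events rather than by
   blocking the host.  */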
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
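
/* Install the user-provided CUstream STREAM as the stream for ASYNC,
   destroying and replacing any stream previously bound to that value.  */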
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}
void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const struct targ_fn_launch *fn_descs;
  size_t fn_num;
} nvptx_tdata_t;

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               size_t *sizes, unsigned short *kinds,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
              async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}