libgomp/plugin/plugin-nvptx.c
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
52 #if PLUGIN_NVPTX_DYNAMIC
53 # include <dlfcn.h>
55 # define CUDA_CALLS \
56 CUDA_ONE_CALL (cuCtxCreate) \
57 CUDA_ONE_CALL (cuCtxDestroy) \
58 CUDA_ONE_CALL (cuCtxGetCurrent) \
59 CUDA_ONE_CALL (cuCtxGetDevice) \
60 CUDA_ONE_CALL (cuCtxPopCurrent) \
61 CUDA_ONE_CALL (cuCtxPushCurrent) \
62 CUDA_ONE_CALL (cuCtxSynchronize) \
63 CUDA_ONE_CALL (cuDeviceGet) \
64 CUDA_ONE_CALL (cuDeviceGetAttribute) \
65 CUDA_ONE_CALL (cuDeviceGetCount) \
66 CUDA_ONE_CALL (cuEventCreate) \
67 CUDA_ONE_CALL (cuEventDestroy) \
68 CUDA_ONE_CALL (cuEventElapsedTime) \
69 CUDA_ONE_CALL (cuEventQuery) \
70 CUDA_ONE_CALL (cuEventRecord) \
71 CUDA_ONE_CALL (cuEventSynchronize) \
72 CUDA_ONE_CALL (cuFuncGetAttribute) \
73 CUDA_ONE_CALL (cuGetErrorString) \
74 CUDA_ONE_CALL (cuInit) \
75 CUDA_ONE_CALL (cuLaunchKernel) \
76 CUDA_ONE_CALL (cuLinkAddData) \
77 CUDA_ONE_CALL (cuLinkComplete) \
78 CUDA_ONE_CALL (cuLinkCreate) \
79 CUDA_ONE_CALL (cuLinkDestroy) \
80 CUDA_ONE_CALL (cuMemAlloc) \
81 CUDA_ONE_CALL (cuMemAllocHost) \
82 CUDA_ONE_CALL (cuMemcpy) \
83 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
84 CUDA_ONE_CALL (cuMemcpyDtoH) \
85 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
86 CUDA_ONE_CALL (cuMemcpyHtoD) \
87 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
88 CUDA_ONE_CALL (cuMemFree) \
89 CUDA_ONE_CALL (cuMemFreeHost) \
90 CUDA_ONE_CALL (cuMemGetAddressRange) \
91 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
92 CUDA_ONE_CALL (cuModuleGetFunction) \
93 CUDA_ONE_CALL (cuModuleGetGlobal) \
94 CUDA_ONE_CALL (cuModuleLoad) \
95 CUDA_ONE_CALL (cuModuleLoadData) \
96 CUDA_ONE_CALL (cuModuleUnload) \
97 CUDA_ONE_CALL (cuStreamCreate) \
98 CUDA_ONE_CALL (cuStreamDestroy) \
99 CUDA_ONE_CALL (cuStreamQuery) \
100 CUDA_ONE_CALL (cuStreamSynchronize) \
101 CUDA_ONE_CALL (cuStreamWaitEvent)
102 # define CUDA_ONE_CALL(call) \
103 __typeof (call) *call;
104 struct cuda_lib_s {
105 CUDA_CALLS
106 } cuda_lib;
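/* CUDA_CALLS is an X-macro: it expands CUDA_ONE_CALL once per CUDA driver
   entry point used by this plugin.  Above, CUDA_ONE_CALL declares one
   function-pointer member of struct cuda_lib_s per entry point; inside
   init_cuda_lib below it is redefined so that the same list resolves each
   symbol with dlsym and stores it in the corresponding member.  */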
108 /* -1 if init_cuda_lib has not been called yet, false
109 if it has been and failed, true if it has been and succeeded. */
110 static signed char cuda_lib_inited = -1;
112 /* Dynamically load the CUDA runtime library and initialize function
113 pointers; return false if unsuccessful, true if successful.  */
114 static bool
115 init_cuda_lib (void)
117 if (cuda_lib_inited != -1)
118 return cuda_lib_inited;
119 const char *cuda_runtime_lib = "libcuda.so.1";
120 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
121 cuda_lib_inited = false;
122 if (h == NULL)
123 return false;
124 # undef CUDA_ONE_CALL
125 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
126 # define CUDA_ONE_CALL_1(call) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (cuda_lib.call == NULL) \
129 return false;
130 CUDA_CALLS
131 cuda_lib_inited = true;
132 return true;
134 # undef CUDA_ONE_CALL
135 # undef CUDA_ONE_CALL_1
136 # define CUDA_CALL_PREFIX cuda_lib.
137 #else
138 # define CUDA_CALL_PREFIX
139 # define init_cuda_lib() true
140 #endif
142 #include "secure_getenv.h"
144 /* Convenience macros for the frequently used CUDA library call and
145 error-handling sequence, as well as for CUDA library calls that
146 do the error checking themselves or do not do it at all.  */
148 #define CUDA_CALL_ERET(ERET, FN, ...) \
149 do { \
150 unsigned __r \
151 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
152 if (__r != CUDA_SUCCESS) \
154 GOMP_PLUGIN_error (#FN " error: %s", \
155 cuda_error (__r)); \
156 return ERET; \
158 } while (0)
160 #define CUDA_CALL(FN, ...) \
161 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
163 #define CUDA_CALL_ASSERT(FN, ...) \
164 do { \
165 unsigned __r \
166 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
167 if (__r != CUDA_SUCCESS) \
169 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 cuda_error (__r)); \
172 } while (0)
174 #define CUDA_CALL_NOCHECK(FN, ...) \
175 CUDA_CALL_PREFIX FN (__VA_ARGS__)
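/* Illustrative uses of the macros above, as they appear later in this file:

     CUDA_CALL (cuMemAllocHost, &s->h, size);                (return false on error)
     CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);            (return NULL on error)
     CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);  (fatal on error)
     r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);          (caller checks R)

   Each expands to a call through CUDA_CALL_PREFIX, i.e. through the cuda_lib
   function-pointer table when PLUGIN_NVPTX_DYNAMIC is set, or directly into
   libcuda otherwise.  */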
177 static const char *
178 cuda_error (CUresult r)
180 #if CUDA_VERSION < 7000
181 /* Specified in the documentation and present in the library since at
182 least CUDA 5.5.  Not declared in the header file prior to CUDA 7.0.  */
183 extern CUresult cuGetErrorString (CUresult, const char **);
184 #endif
185 const char *desc;
187 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
188 if (r != CUDA_SUCCESS)
189 desc = "unknown cuda error";
191 return desc;
194 static unsigned int instantiated_devices = 0;
195 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
197 struct ptx_stream
199 CUstream stream;
200 pthread_t host_thread;
201 bool multithreaded;
203 CUdeviceptr d;
204 void *h;
205 void *h_begin;
206 void *h_end;
207 void *h_next;
208 void *h_prev;
209 void *h_tail;
211 struct ptx_stream *next;
214 /* Thread-specific data for PTX. */
216 struct nvptx_thread
218 struct ptx_stream *current_stream;
219 struct ptx_device *ptx_dev;
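/* As inferred from map_init/map_push/map_pop below: a single page of
   page-locked host memory, allocated with cuMemAllocHost and also visible to
   the device via cuMemHostGetDevicePointer, is carved into variable-sized
   "struct map" records by map_push and released in FIFO order by map_pop;
   H_NEXT/H_PREV/H_TAIL in struct ptx_stream implement the resulting ring
   buffer.  Each record stages the array of argument pointers for one
   (possibly asynchronous) kernel launch.  */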
222 struct map
224 int async;
225 size_t size;
226 char mappings[0];
229 static bool
230 map_init (struct ptx_stream *s)
232 int size = getpagesize ();
234 assert (s);
235 assert (!s->d);
236 assert (!s->h);
238 CUDA_CALL (cuMemAllocHost, &s->h, size);
239 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
241 assert (s->h);
243 s->h_begin = s->h;
244 s->h_end = s->h_begin + size;
245 s->h_next = s->h_prev = s->h_tail = s->h_begin;
247 assert (s->h_next);
248 assert (s->h_end);
249 return true;
252 static bool
253 map_fini (struct ptx_stream *s)
255 CUDA_CALL (cuMemFreeHost, s->h);
256 return true;
259 static void
260 map_pop (struct ptx_stream *s)
262 struct map *m;
264 assert (s != NULL);
265 assert (s->h_next);
266 assert (s->h_prev);
267 assert (s->h_tail);
269 m = s->h_tail;
271 s->h_tail += m->size;
273 if (s->h_tail >= s->h_end)
274 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
276 if (s->h_next == s->h_tail)
277 s->h_prev = s->h_next;
279 assert (s->h_next >= s->h_begin);
280 assert (s->h_tail >= s->h_begin);
281 assert (s->h_prev >= s->h_begin);
283 assert (s->h_next <= s->h_end);
284 assert (s->h_tail <= s->h_end);
285 assert (s->h_prev <= s->h_end);
288 static void
289 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
291 int left;
292 int offset;
293 struct map *m;
295 assert (s != NULL);
297 left = s->h_end - s->h_next;
298 size += sizeof (struct map);
300 assert (s->h_prev);
301 assert (s->h_next);
303 if (size >= left)
305 m = s->h_prev;
306 m->size += left;
307 s->h_next = s->h_begin;
309 if (s->h_next + size > s->h_end)
310 GOMP_PLUGIN_fatal ("unable to push map");
313 assert (s->h_next);
315 m = s->h_next;
316 m->async = async;
317 m->size = size;
319 offset = (void *)&m->mappings[0] - s->h;
321 *d = (void *)(s->d + offset);
322 *h = (void *)(s->h + offset);
324 s->h_prev = s->h_next;
325 s->h_next += size;
327 assert (s->h_prev);
328 assert (s->h_next);
330 assert (s->h_next >= s->h_begin);
331 assert (s->h_tail >= s->h_begin);
332 assert (s->h_prev >= s->h_begin);
333 assert (s->h_next <= s->h_end);
334 assert (s->h_tail <= s->h_end);
335 assert (s->h_prev <= s->h_end);
337 return;
340 /* Target data function launch information. */
342 struct targ_fn_launch
344 const char *fn;
345 unsigned short dim[GOMP_DIM_MAX];
348 /* Target PTX object information. */
350 struct targ_ptx_obj
352 const char *code;
353 size_t size;
356 /* Target data image information. */
358 typedef struct nvptx_tdata
360 const struct targ_ptx_obj *ptx_objs;
361 unsigned ptx_num;
363 const char *const *var_names;
364 unsigned var_num;
366 const struct targ_fn_launch *fn_descs;
367 unsigned fn_num;
368 } nvptx_tdata_t;
370 /* Descriptor of a loaded function. */
372 struct targ_fn_descriptor
374 CUfunction fn;
375 const struct targ_fn_launch *launch;
376 int regs_per_thread;
377 int max_threads_per_block;
380 /* A loaded PTX image. */
381 struct ptx_image_data
383 const void *target_data;
384 CUmodule module;
386 struct targ_fn_descriptor *fns; /* Array of functions. */
388 struct ptx_image_data *next;
391 struct ptx_device
393 CUcontext ctx;
394 bool ctx_shared;
395 CUdevice dev;
396 struct ptx_stream *null_stream;
397 /* All non-null streams associated with this device (actually context),
398 either created implicitly or passed in from the user (via
399 acc_set_cuda_stream). */
400 struct ptx_stream *active_streams;
401 struct {
402 struct ptx_stream **arr;
403 int size;
404 } async_streams;
405 /* A lock for use when manipulating the above stream list and array. */
406 pthread_mutex_t stream_lock;
407 int ord;
408 bool overlap;
409 bool map;
410 bool concur;
411 bool mkern;
412 int mode;
413 int clock_khz;
414 int num_sms;
415 int regs_per_block;
416 int regs_per_sm;
418 struct ptx_image_data *images; /* Images loaded on device. */
419 pthread_mutex_t image_lock; /* Lock for above list. */
421 struct ptx_device *next;
424 enum ptx_event_type
426 PTX_EVT_MEM,
427 PTX_EVT_KNL,
428 PTX_EVT_SYNC,
429 PTX_EVT_ASYNC_CLEANUP
432 struct ptx_event
434 CUevent *evt;
435 int type;
436 void *addr;
437 int ord;
438 int val;
440 struct ptx_event *next;
443 static pthread_mutex_t ptx_event_lock;
444 static struct ptx_event *ptx_events;
446 static struct ptx_device **ptx_devices;
448 static inline struct nvptx_thread *
449 nvptx_thread (void)
451 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
454 static bool
455 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
457 int i;
458 struct ptx_stream *null_stream
459 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
461 null_stream->stream = NULL;
462 null_stream->host_thread = pthread_self ();
463 null_stream->multithreaded = true;
464 null_stream->d = (CUdeviceptr) NULL;
465 null_stream->h = NULL;
466 if (!map_init (null_stream))
467 return false;
469 ptx_dev->null_stream = null_stream;
470 ptx_dev->active_streams = NULL;
471 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
473 if (concurrency < 1)
474 concurrency = 1;
476 /* This is just a guess -- make space for as many async streams as the
477 current device is capable of concurrently executing. This can grow
478 later as necessary. No streams are created yet. */
479 ptx_dev->async_streams.arr
480 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
481 ptx_dev->async_streams.size = concurrency;
483 for (i = 0; i < concurrency; i++)
484 ptx_dev->async_streams.arr[i] = NULL;
486 return true;
489 static bool
490 fini_streams_for_device (struct ptx_device *ptx_dev)
492 free (ptx_dev->async_streams.arr);
494 bool ret = true;
495 while (ptx_dev->active_streams != NULL)
497 struct ptx_stream *s = ptx_dev->active_streams;
498 ptx_dev->active_streams = ptx_dev->active_streams->next;
500 ret &= map_fini (s);
502 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
503 if (r != CUDA_SUCCESS)
505 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
506 ret = false;
508 free (s);
511 ret &= map_fini (ptx_dev->null_stream);
512 free (ptx_dev->null_stream);
513 return ret;
516 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
517 thread THREAD (and also current device/context). If CREATE is true, create
518 the stream if it does not exist (or use EXISTING if it is non-NULL), and
519 associate the stream with the same thread argument. Returns stream to use
520 as result. */
522 static struct ptx_stream *
523 select_stream_for_async (int async, pthread_t thread, bool create,
524 CUstream existing)
526 struct nvptx_thread *nvthd = nvptx_thread ();
527 /* Local copy of TLS variable. */
528 struct ptx_device *ptx_dev = nvthd->ptx_dev;
529 struct ptx_stream *stream = NULL;
530 int orig_async = async;
532 /* The special value acc_async_noval (-1) maps (for now) to an
533 implicitly-created stream, which is then handled the same as any other
534 numbered async stream. Other options are available, e.g. using the null
535 stream for anonymous async operations, or choosing an idle stream from an
536 active set.  But stick with this for now.  */
537 if (async > acc_async_sync)
538 async++;
540 if (create)
541 pthread_mutex_lock (&ptx_dev->stream_lock);
543 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
544 null stream, and in fact better performance may be obtainable if it doesn't
545 (because the null stream enforces overly-strict synchronisation with
546 respect to other streams for legacy reasons, and that's probably not
547 needed with OpenACC). Maybe investigate later. */
548 if (async == acc_async_sync)
549 stream = ptx_dev->null_stream;
550 else if (async >= 0 && async < ptx_dev->async_streams.size
551 && ptx_dev->async_streams.arr[async] && !(create && existing))
552 stream = ptx_dev->async_streams.arr[async];
553 else if (async >= 0 && create)
555 if (async >= ptx_dev->async_streams.size)
557 int i, newsize = ptx_dev->async_streams.size * 2;
559 if (async >= newsize)
560 newsize = async + 1;
562 ptx_dev->async_streams.arr
563 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
564 newsize * sizeof (struct ptx_stream *));
566 for (i = ptx_dev->async_streams.size; i < newsize; i++)
567 ptx_dev->async_streams.arr[i] = NULL;
569 ptx_dev->async_streams.size = newsize;
572 /* Create a new stream on-demand if there isn't one already, or if we're
573 setting a particular async value to an existing (externally-provided)
574 stream. */
575 if (!ptx_dev->async_streams.arr[async] || existing)
577 CUresult r;
578 struct ptx_stream *s
579 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
581 if (existing)
582 s->stream = existing;
583 else
585 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
586 CU_STREAM_DEFAULT);
587 if (r != CUDA_SUCCESS)
589 pthread_mutex_unlock (&ptx_dev->stream_lock);
590 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
591 cuda_error (r));
595 /* If CREATE is true, we're going to be queueing some work on this
596 stream. Associate it with the current host thread. */
597 s->host_thread = thread;
598 s->multithreaded = false;
600 s->d = (CUdeviceptr) NULL;
601 s->h = NULL;
602 if (!map_init (s))
604 pthread_mutex_unlock (&ptx_dev->stream_lock);
605 GOMP_PLUGIN_fatal ("map_init fail");
608 s->next = ptx_dev->active_streams;
609 ptx_dev->active_streams = s;
610 ptx_dev->async_streams.arr[async] = s;
613 stream = ptx_dev->async_streams.arr[async];
615 else if (async < 0)
617 if (create)
618 pthread_mutex_unlock (&ptx_dev->stream_lock);
619 GOMP_PLUGIN_fatal ("bad async %d", async);
622 if (create)
624 assert (stream != NULL);
626 /* If we're trying to use the same stream from different threads
627 simultaneously, set stream->multithreaded to true. This affects the
628 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
629 only wait for asynchronous launches from the same host thread they are
630 invoked on. If multiple threads use the same async value, we make note
631 of that here and fall back to testing/waiting for all threads in those
632 functions. */
633 if (thread != stream->host_thread)
634 stream->multithreaded = true;
636 pthread_mutex_unlock (&ptx_dev->stream_lock);
638 else if (stream && !stream->multithreaded
639 && !pthread_equal (stream->host_thread, thread))
640 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
642 return stream;
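/* Typical call sites elsewhere in this file: nvptx_set_async binds a stream
   to the current thread with

     select_stream_for_async (async, pthread_self (), true, NULL);

   while query paths such as nvptx_async_test look a stream up without
   creating one:

     select_stream_for_async (async, pthread_self (), false, NULL);

   nvptx_set_cuda_stream passes a user-supplied CUstream as EXISTING.  */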
645 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
646 should be locked on entry and remains locked on exit. */
648 static bool
649 nvptx_init (void)
651 int ndevs;
653 if (instantiated_devices != 0)
654 return true;
656 ptx_events = NULL;
657 pthread_mutex_init (&ptx_event_lock, NULL);
659 if (!init_cuda_lib ())
660 return false;
662 CUDA_CALL (cuInit, 0);
664 CUDA_CALL (cuDeviceGetCount, &ndevs);
665 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
666 * ndevs);
667 return true;
670 /* Select the N'th PTX device for the current host thread. The device must
671 have been previously opened before calling this function. */
673 static bool
674 nvptx_attach_host_thread_to_device (int n)
676 CUdevice dev;
677 CUresult r;
678 struct ptx_device *ptx_dev;
679 CUcontext thd_ctx;
681 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
682 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
684 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
685 return false;
688 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
689 return true;
690 else
692 CUcontext old_ctx;
694 ptx_dev = ptx_devices[n];
695 if (!ptx_dev)
697 GOMP_PLUGIN_error ("device %d not found", n);
698 return false;
701 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
703 /* We don't necessarily have a current context (e.g. if it has been
704 destroyed).  Pop it if we do, though.  */
705 if (thd_ctx != NULL)
706 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
708 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
710 return true;
713 static struct ptx_device *
714 nvptx_open_device (int n)
716 struct ptx_device *ptx_dev;
717 CUdevice dev, ctx_dev;
718 CUresult r;
719 int async_engines, pi;
721 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
723 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
725 ptx_dev->ord = n;
726 ptx_dev->dev = dev;
727 ptx_dev->ctx_shared = false;
729 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
730 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
732 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
733 return NULL;
736 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
738 /* The current host thread has an active context for a different device.
739 Detach it. */
740 CUcontext old_ctx;
741 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
744 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
746 if (!ptx_dev->ctx)
747 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
748 else
749 ptx_dev->ctx_shared = true;
751 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
752 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
753 ptx_dev->overlap = pi;
755 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
756 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
757 ptx_dev->map = pi;
759 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
760 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
761 ptx_dev->concur = pi;
763 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
764 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
765 ptx_dev->mode = pi;
767 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
768 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
769 ptx_dev->mkern = pi;
771 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
772 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
773 ptx_dev->clock_khz = pi;
775 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
776 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
777 ptx_dev->num_sms = pi;
779 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
780 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
781 ptx_dev->regs_per_block = pi;
783 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
784 in CUDA 6.0 and newer. */
785 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
786 /* Fallback: use limit of registers per block, which is usually equal. */
787 if (r == CUDA_ERROR_INVALID_VALUE)
788 pi = ptx_dev->regs_per_block;
789 else if (r != CUDA_SUCCESS)
791 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
792 return NULL;
794 ptx_dev->regs_per_sm = pi;
796 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
797 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
798 if (pi != 32)
800 GOMP_PLUGIN_error ("Only warp size 32 is supported");
801 return NULL;
804 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
805 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
806 if (r != CUDA_SUCCESS)
807 async_engines = 1;
809 ptx_dev->images = NULL;
810 pthread_mutex_init (&ptx_dev->image_lock, NULL);
812 if (!init_streams_for_device (ptx_dev, async_engines))
813 return NULL;
815 return ptx_dev;
818 static bool
819 nvptx_close_device (struct ptx_device *ptx_dev)
821 if (!ptx_dev)
822 return true;
824 if (!fini_streams_for_device (ptx_dev))
825 return false;
827 pthread_mutex_destroy (&ptx_dev->image_lock);
829 if (!ptx_dev->ctx_shared)
830 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
832 free (ptx_dev);
833 return true;
836 static int
837 nvptx_get_num_devices (void)
839 int n;
841 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
842 configurations. */
843 if (sizeof (void *) != 8)
845 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
846 " only 64-bit configurations are supported\n");
847 return 0;
850 /* This function will be called before the plugin has been initialized in
851 order to enumerate available devices, but CUDA API routines can't be used
852 until cuInit has been called. Just call it now (but don't yet do any
853 further initialization). */
854 if (instantiated_devices == 0)
856 if (!init_cuda_lib ())
857 return 0;
858 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
859 /* This is not an error: e.g. we may have CUDA libraries installed but
860 no devices available. */
861 if (r != CUDA_SUCCESS)
863 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
864 cuda_error (r));
865 return 0;
869 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
870 return n;
873 static void
874 notify_var (const char *var_name, const char *env_var)
876 if (env_var == NULL)
877 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
878 else
879 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
882 static void
883 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
885 const char *var_name = "GOMP_NVPTX_JIT";
886 const char *env_var = secure_getenv (var_name);
887 notify_var (var_name, env_var);
889 if (env_var == NULL)
890 return;
892 const char *c = env_var;
893 while (*c != '\0')
895 while (*c == ' ')
896 c++;
898 if (c[0] == '-' && c[1] == 'O'
899 && '0' <= c[2] && c[2] <= '4'
900 && (c[3] == '\0' || c[3] == ' '))
902 *gomp_nvptx_o = c[2] - '0';
903 c += 3;
904 continue;
907 GOMP_PLUGIN_error ("Error parsing %s", var_name);
908 break;
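/* For example, GOMP_NVPTX_JIT=-O3 in the environment selects PTX JIT
   optimization level 3 for link_ptx below; the parser above accepts only
   -O0 .. -O4 tokens (optionally separated by spaces) and reports a parse
   error for anything else.  */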
912 static bool
913 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
914 unsigned num_objs)
916 CUjit_option opts[7];
917 void *optvals[7];
918 float elapsed = 0.0;
919 char elog[1024];
920 char ilog[16384];
921 CUlinkState linkstate;
922 CUresult r;
923 void *linkout;
924 size_t linkoutsize __attribute__ ((unused));
926 opts[0] = CU_JIT_WALL_TIME;
927 optvals[0] = &elapsed;
929 opts[1] = CU_JIT_INFO_LOG_BUFFER;
930 optvals[1] = &ilog[0];
932 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
933 optvals[2] = (void *) sizeof ilog;
935 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
936 optvals[3] = &elog[0];
938 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
939 optvals[4] = (void *) sizeof elog;
941 opts[5] = CU_JIT_LOG_VERBOSE;
942 optvals[5] = (void *) 1;
944 static intptr_t gomp_nvptx_o = -1;
946 static bool init_done = false;
947 if (!init_done)
949 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
950 init_done = true;
953 int nopts = 6;
954 if (gomp_nvptx_o != -1)
956 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
957 optvals[nopts] = (void *) gomp_nvptx_o;
958 nopts++;
961 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
963 for (; num_objs--; ptx_objs++)
965 /* cuLinkAddData's 'data' argument erroneously omits the const
966 qualifier. */
967 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
968 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
969 (char *) ptx_objs->code, ptx_objs->size,
970 0, 0, 0, 0);
971 if (r != CUDA_SUCCESS)
973 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
974 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
975 cuda_error (r));
976 return false;
980 GOMP_PLUGIN_debug (0, "Linking\n");
981 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
983 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
984 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
986 if (r != CUDA_SUCCESS)
988 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
989 return false;
992 CUDA_CALL (cuModuleLoadData, module, linkout);
993 CUDA_CALL (cuLinkDestroy, linkstate);
994 return true;
997 static void
998 event_gc (bool memmap_lockable)
1000 struct ptx_event *ptx_event = ptx_events;
1001 struct ptx_event *async_cleanups = NULL;
1002 struct nvptx_thread *nvthd = nvptx_thread ();
1004 pthread_mutex_lock (&ptx_event_lock);
1006 while (ptx_event != NULL)
1008 CUresult r;
1009 struct ptx_event *e = ptx_event;
1011 ptx_event = ptx_event->next;
1013 if (e->ord != nvthd->ptx_dev->ord)
1014 continue;
1016 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1017 if (r == CUDA_SUCCESS)
1019 bool append_async = false;
1020 CUevent *te;
1022 te = e->evt;
1024 switch (e->type)
1026 case PTX_EVT_MEM:
1027 case PTX_EVT_SYNC:
1028 break;
1030 case PTX_EVT_KNL:
1031 map_pop (e->addr);
1032 break;
1034 case PTX_EVT_ASYNC_CLEANUP:
1036 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
1037 memory-map splay tree lock for the current device, so we
1038 can't call it when one of our callers has already claimed
1039 the lock. In that case, just delay the GC for this event
1040 until later. */
1041 if (!memmap_lockable)
1042 continue;
1044 append_async = true;
1046 break;
1049 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1050 free ((void *)te);
1052 /* Unlink 'e' from ptx_events list. */
1053 if (ptx_events == e)
1054 ptx_events = ptx_events->next;
1055 else
1057 struct ptx_event *e_ = ptx_events;
1058 while (e_->next != e)
1059 e_ = e_->next;
1060 e_->next = e_->next->next;
1063 if (append_async)
1065 e->next = async_cleanups;
1066 async_cleanups = e;
1068 else
1069 free (e);
1073 pthread_mutex_unlock (&ptx_event_lock);
1075 /* We have to do these here, after ptx_event_lock is released. */
1076 while (async_cleanups)
1078 struct ptx_event *e = async_cleanups;
1079 async_cleanups = async_cleanups->next;
1081 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1082 free (e);
1086 static void
1087 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1089 struct ptx_event *ptx_event;
1090 struct nvptx_thread *nvthd = nvptx_thread ();
1092 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1093 || type == PTX_EVT_ASYNC_CLEANUP);
1095 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1096 ptx_event->type = type;
1097 ptx_event->evt = e;
1098 ptx_event->addr = h;
1099 ptx_event->ord = nvthd->ptx_dev->ord;
1100 ptx_event->val = val;
1102 pthread_mutex_lock (&ptx_event_lock);
1104 ptx_event->next = ptx_events;
1105 ptx_events = ptx_event;
1107 pthread_mutex_unlock (&ptx_event_lock);
1110 static void
1111 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1112 int async, unsigned *dims, void *targ_mem_desc)
1114 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1115 CUfunction function;
1116 CUresult r;
1117 int i;
1118 struct ptx_stream *dev_str;
1119 void *kargs[1];
1120 void *hp, *dp;
1121 struct nvptx_thread *nvthd = nvptx_thread ();
1122 const char *maybe_abort_msg = "(perhaps abort was called)";
1124 function = targ_fn->fn;
1126 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1127 assert (dev_str == nvthd->current_stream);
1129 /* Initialize the launch dimensions. Typically this is constant,
1130 provided by the device compiler, but we must permit runtime
1131 values. */
1132 int seen_zero = 0;
1133 for (i = 0; i != GOMP_DIM_MAX; i++)
1135 if (targ_fn->launch->dim[i])
1136 dims[i] = targ_fn->launch->dim[i];
1137 if (!dims[i])
1138 seen_zero = 1;
1141 if (seen_zero)
1143 /* See if the user provided GOMP_OPENACC_DIM environment
1144 variable to specify runtime defaults. */
1145 static int default_dims[GOMP_DIM_MAX];
1147 pthread_mutex_lock (&ptx_dev_lock);
1148 if (!default_dims[0])
1150 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1151 default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
1153 int warp_size, block_size, dev_size, cpu_size;
1154 CUdevice dev = nvptx_thread()->ptx_dev->dev;
1155 /* 32 is the default for known hardware. */
1156 int gang = 0, worker = 32, vector = 32;
1157 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1159 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1160 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1161 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1162 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1164 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1165 dev) == CUDA_SUCCESS
1166 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1167 dev) == CUDA_SUCCESS
1168 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1169 dev) == CUDA_SUCCESS
1170 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1171 dev) == CUDA_SUCCESS)
1173 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1174 " dev_size=%d, cpu_size=%d\n",
1175 warp_size, block_size, dev_size, cpu_size);
1176 gang = (cpu_size / block_size) * dev_size;
1177 worker = block_size / warp_size;
1178 vector = warp_size;
1181 /* There is no upper bound on the gang size. The best size
1182 matches the hardware configuration. Logical gangs are
1183 scheduled onto physical hardware. To maximize usage, we
1184 should guess a large number. */
1185 if (default_dims[GOMP_DIM_GANG] < 1)
1186 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1187 /* The worker size must not exceed the hardware. */
1188 if (default_dims[GOMP_DIM_WORKER] < 1
1189 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1190 default_dims[GOMP_DIM_WORKER] = worker;
1191 /* The vector size must exactly match the hardware. */
1192 if (default_dims[GOMP_DIM_VECTOR] < 1
1193 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1194 default_dims[GOMP_DIM_VECTOR] = vector;
1196 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1197 default_dims[GOMP_DIM_GANG],
1198 default_dims[GOMP_DIM_WORKER],
1199 default_dims[GOMP_DIM_VECTOR]);
1201 pthread_mutex_unlock (&ptx_dev_lock);
1203 for (i = 0; i != GOMP_DIM_MAX; i++)
1204 if (!dims[i])
1205 dims[i] = default_dims[i];
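/* Worked example with hypothetical device attributes: for warp_size = 32,
   block_size = 1024 threads/block, dev_size = 20 SMs and cpu_size = 2048
   threads/SM, the computation above gives gang = (2048 / 1024) * 20 = 40,
   worker = 1024 / 32 = 32 and vector = 32.  These fill in default_dims when
   GOMP_OPENACC_DIM does not supply usable values, and are then used for any
   launch dimension the kernel left as zero.  */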
1208 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1209 the host and the device. HP is a host pointer to the new chunk, and DP is
1210 the corresponding device pointer. */
1211 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1213 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1215 /* Copy the array of arguments to the mapped page. */
1216 for (i = 0; i < mapnum; i++)
1217 ((void **) hp)[i] = devaddrs[i];
1219 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1220 fact have the same value on a unified-memory system). */
1221 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1222 mapnum * sizeof (void *));
1223 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1224 " gangs=%u, workers=%u, vectors=%u\n",
1225 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1226 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1228 // OpenACC CUDA
1230 // num_gangs nctaid.x
1231 // num_workers ntid.y
1232 // vector length ntid.x
1234 kargs[0] = &dp;
1235 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1236 dims[GOMP_DIM_GANG], 1, 1,
1237 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1238 0, dev_str->stream, kargs, 0);
1240 #ifndef DISABLE_ASYNC
1241 if (async < acc_async_noval)
1243 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1244 if (r == CUDA_ERROR_LAUNCH_FAILED)
1245 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1246 maybe_abort_msg);
1247 else if (r != CUDA_SUCCESS)
1248 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1250 else
1252 CUevent *e;
1254 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1256 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1257 if (r == CUDA_ERROR_LAUNCH_FAILED)
1258 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1259 maybe_abort_msg);
1260 else if (r != CUDA_SUCCESS)
1261 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1263 event_gc (true);
1265 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1267 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1269 #else
1270 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1271 if (r == CUDA_ERROR_LAUNCH_FAILED)
1272 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1273 maybe_abort_msg);
1274 else if (r != CUDA_SUCCESS)
1275 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1276 #endif
1278 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1279 targ_fn->launch->fn);
1281 #ifndef DISABLE_ASYNC
1282 if (async < acc_async_noval)
1283 #endif
1284 map_pop (dev_str);
1287 void * openacc_get_current_cuda_context (void);
1289 static void *
1290 nvptx_alloc (size_t s)
1292 CUdeviceptr d;
1294 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1295 return (void *) d;
1298 static bool
1299 nvptx_free (void *p)
1301 CUdeviceptr pb;
1302 size_t ps;
1304 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1305 if ((CUdeviceptr) p != pb)
1307 GOMP_PLUGIN_error ("invalid device address");
1308 return false;
1311 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1312 return true;
1316 static bool
1317 nvptx_host2dev (void *d, const void *h, size_t s)
1319 CUdeviceptr pb;
1320 size_t ps;
1321 struct nvptx_thread *nvthd = nvptx_thread ();
1323 if (!s)
1324 return true;
1325 if (!d)
1327 GOMP_PLUGIN_error ("invalid device address");
1328 return false;
1331 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1333 if (!pb)
1335 GOMP_PLUGIN_error ("invalid device address");
1336 return false;
1338 if (!h)
1340 GOMP_PLUGIN_error ("invalid host address");
1341 return false;
1343 if (d == h)
1345 GOMP_PLUGIN_error ("invalid host or device address");
1346 return false;
1348 if ((void *)(d + s) > (void *)(pb + ps))
1350 GOMP_PLUGIN_error ("invalid size");
1351 return false;
1354 #ifndef DISABLE_ASYNC
1355 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1357 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1358 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1359 event_gc (false);
1360 CUDA_CALL (cuMemcpyHtoDAsync,
1361 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1362 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1363 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1365 else
1366 #endif
1367 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1369 return true;
1372 static bool
1373 nvptx_dev2host (void *h, const void *d, size_t s)
1375 CUdeviceptr pb;
1376 size_t ps;
1377 struct nvptx_thread *nvthd = nvptx_thread ();
1379 if (!s)
1380 return true;
1381 if (!d)
1383 GOMP_PLUGIN_error ("invalid device address");
1384 return false;
1387 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1389 if (!pb)
1391 GOMP_PLUGIN_error ("invalid device address");
1392 return false;
1394 if (!h)
1396 GOMP_PLUGIN_error ("invalid host address");
1397 return false;
1399 if (d == h)
1401 GOMP_PLUGIN_error ("invalid host or device address");
1402 return false;
1404 if ((void *)(d + s) > (void *)(pb + ps))
1406 GOMP_PLUGIN_error ("invalid size");
1407 return false;
1410 #ifndef DISABLE_ASYNC
1411 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1413 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1414 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1415 event_gc (false);
1416 CUDA_CALL (cuMemcpyDtoHAsync,
1417 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1418 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1419 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1421 else
1422 #endif
1423 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1425 return true;
1428 static void
1429 nvptx_set_async (int async)
1431 struct nvptx_thread *nvthd = nvptx_thread ();
1432 nvthd->current_stream
1433 = select_stream_for_async (async, pthread_self (), true, NULL);
1436 static int
1437 nvptx_async_test (int async)
1439 CUresult r;
1440 struct ptx_stream *s;
1442 s = select_stream_for_async (async, pthread_self (), false, NULL);
1444 if (!s)
1445 GOMP_PLUGIN_fatal ("unknown async %d", async);
1447 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1448 if (r == CUDA_SUCCESS)
1450 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1451 whether all work has completed on this stream, and if so omits the call
1452 to the wait hook. If that happens, event_gc might not get called
1453 (which prevents variables from getting unmapped and their associated
1454 device storage freed), so call it here. */
1455 event_gc (true);
1456 return 1;
1458 else if (r == CUDA_ERROR_NOT_READY)
1459 return 0;
1461 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1463 return 0;
1466 static int
1467 nvptx_async_test_all (void)
1469 struct ptx_stream *s;
1470 pthread_t self = pthread_self ();
1471 struct nvptx_thread *nvthd = nvptx_thread ();
1473 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1475 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1477 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1478 && CUDA_CALL_NOCHECK (cuStreamQuery,
1479 s->stream) == CUDA_ERROR_NOT_READY)
1481 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1482 return 0;
1486 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1488 event_gc (true);
1490 return 1;
1493 static void
1494 nvptx_wait (int async)
1496 struct ptx_stream *s;
1498 s = select_stream_for_async (async, pthread_self (), false, NULL);
1499 if (!s)
1500 GOMP_PLUGIN_fatal ("unknown async %d", async);
1502 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1504 event_gc (true);
1507 static void
1508 nvptx_wait_async (int async1, int async2)
1510 CUevent *e;
1511 struct ptx_stream *s1, *s2;
1512 pthread_t self = pthread_self ();
1514 /* The stream that is waiting (rather than being waited for) doesn't
1515 necessarily have to exist already. */
1516 s2 = select_stream_for_async (async2, self, true, NULL);
1518 s1 = select_stream_for_async (async1, self, false, NULL);
1519 if (!s1)
1520 GOMP_PLUGIN_fatal ("invalid async 1\n");
1522 if (s1 == s2)
1523 GOMP_PLUGIN_fatal ("identical parameters");
1525 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1527 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1529 event_gc (true);
1531 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1533 event_add (PTX_EVT_SYNC, e, NULL, 0);
1535 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1538 static void
1539 nvptx_wait_all (void)
1541 CUresult r;
1542 struct ptx_stream *s;
1543 pthread_t self = pthread_self ();
1544 struct nvptx_thread *nvthd = nvptx_thread ();
1546 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1548 /* Wait for active streams initiated by this thread (or by multiple threads)
1549 to complete. */
1550 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1552 if (s->multithreaded || pthread_equal (s->host_thread, self))
1554 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1555 if (r == CUDA_SUCCESS)
1556 continue;
1557 else if (r != CUDA_ERROR_NOT_READY)
1558 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1560 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1564 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1566 event_gc (true);
1569 static void
1570 nvptx_wait_all_async (int async)
1572 struct ptx_stream *waiting_stream, *other_stream;
1573 CUevent *e;
1574 struct nvptx_thread *nvthd = nvptx_thread ();
1575 pthread_t self = pthread_self ();
1577 /* The stream doing the waiting. This could be the first mention of the
1578 stream, so create it if necessary. */
1579 waiting_stream
1580 = select_stream_for_async (async, pthread_self (), true, NULL);
1582 /* Launches on the null stream already block on other streams in the
1583 context. */
1584 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1585 return;
1587 event_gc (true);
1589 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1591 for (other_stream = nvthd->ptx_dev->active_streams;
1592 other_stream != NULL;
1593 other_stream = other_stream->next)
1595 if (!other_stream->multithreaded
1596 && !pthread_equal (other_stream->host_thread, self))
1597 continue;
1599 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1601 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1603 /* Record an event on the waited-for stream. */
1604 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1606 event_add (PTX_EVT_SYNC, e, NULL, 0);
1608 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1611 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1614 static void *
1615 nvptx_get_current_cuda_device (void)
1617 struct nvptx_thread *nvthd = nvptx_thread ();
1619 if (!nvthd || !nvthd->ptx_dev)
1620 return NULL;
1622 return &nvthd->ptx_dev->dev;
1625 static void *
1626 nvptx_get_current_cuda_context (void)
1628 struct nvptx_thread *nvthd = nvptx_thread ();
1630 if (!nvthd || !nvthd->ptx_dev)
1631 return NULL;
1633 return nvthd->ptx_dev->ctx;
1636 static void *
1637 nvptx_get_cuda_stream (int async)
1639 struct ptx_stream *s;
1640 struct nvptx_thread *nvthd = nvptx_thread ();
1642 if (!nvthd || !nvthd->ptx_dev)
1643 return NULL;
1645 s = select_stream_for_async (async, pthread_self (), false, NULL);
1647 return s ? s->stream : NULL;
1650 static int
1651 nvptx_set_cuda_stream (int async, void *stream)
1653 struct ptx_stream *oldstream;
1654 pthread_t self = pthread_self ();
1655 struct nvptx_thread *nvthd = nvptx_thread ();
1657 if (async < 0)
1658 GOMP_PLUGIN_fatal ("bad async %d", async);
1660 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1662 /* We have a list of active streams and an array mapping async values to
1663 entries of that list. We need to take "ownership" of the passed-in stream,
1664 and add it to our list, removing the previous entry also (if there was one)
1665 in order to prevent resource leaks. Note the potential for surprise
1666 here: maybe we should keep track of passed-in streams and leave it up to
1667 the user to tidy those up, but that doesn't work for stream handles
1668 returned from acc_get_cuda_stream above... */
1670 oldstream = select_stream_for_async (async, self, false, NULL);
1672 if (oldstream)
1674 if (nvthd->ptx_dev->active_streams == oldstream)
1675 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1676 else
1678 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1679 while (s->next != oldstream)
1680 s = s->next;
1681 s->next = s->next->next;
1684 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1686 if (!map_fini (oldstream))
1687 GOMP_PLUGIN_fatal ("error when freeing host memory");
1689 free (oldstream);
1692 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1694 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1696 return 1;
1699 /* Plugin entry points. */
1701 const char *
1702 GOMP_OFFLOAD_get_name (void)
1704 return "nvptx";
1707 unsigned int
1708 GOMP_OFFLOAD_get_caps (void)
1710 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1714 GOMP_OFFLOAD_get_type (void)
1716 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1720 GOMP_OFFLOAD_get_num_devices (void)
1722 return nvptx_get_num_devices ();
1725 bool
1726 GOMP_OFFLOAD_init_device (int n)
1728 struct ptx_device *dev;
1730 pthread_mutex_lock (&ptx_dev_lock);
1732 if (!nvptx_init () || ptx_devices[n] != NULL)
1734 pthread_mutex_unlock (&ptx_dev_lock);
1735 return false;
1738 dev = nvptx_open_device (n);
1739 if (dev)
1741 ptx_devices[n] = dev;
1742 instantiated_devices++;
1745 pthread_mutex_unlock (&ptx_dev_lock);
1747 return dev != NULL;
1750 bool
1751 GOMP_OFFLOAD_fini_device (int n)
1753 pthread_mutex_lock (&ptx_dev_lock);
1755 if (ptx_devices[n] != NULL)
1757 if (!nvptx_attach_host_thread_to_device (n)
1758 || !nvptx_close_device (ptx_devices[n]))
1760 pthread_mutex_unlock (&ptx_dev_lock);
1761 return false;
1763 ptx_devices[n] = NULL;
1764 instantiated_devices--;
1767 pthread_mutex_unlock (&ptx_dev_lock);
1768 return true;
1771 /* Return the libgomp version number we're compatible with. There is
1772 no requirement for cross-version compatibility. */
1774 unsigned
1775 GOMP_OFFLOAD_version (void)
1777 return GOMP_VERSION;
1780 /* Initialize __nvptx_clocktick, if present in MODULE. */
1782 static void
1783 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1785 CUdeviceptr dptr;
1786 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1787 module, "__nvptx_clocktick");
1788 if (r == CUDA_ERROR_NOT_FOUND)
1789 return;
1790 if (r != CUDA_SUCCESS)
1791 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1792 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1793 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1794 sizeof (__nvptx_clocktick));
1795 if (r != CUDA_SUCCESS)
1796 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1799 /* Load the (partial) program described by TARGET_DATA to device
1800 number ORD. Allocate and return TARGET_TABLE. */
1803 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1804 struct addr_pair **target_table)
1806 CUmodule module;
1807 const char *const *var_names;
1808 const struct targ_fn_launch *fn_descs;
1809 unsigned int fn_entries, var_entries, i, j;
1810 struct targ_fn_descriptor *targ_fns;
1811 struct addr_pair *targ_tbl;
1812 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1813 struct ptx_image_data *new_image;
1814 struct ptx_device *dev;
1816 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1818 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1819 " (expected %u, received %u)",
1820 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1821 return -1;
1824 if (!nvptx_attach_host_thread_to_device (ord)
1825 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1826 return -1;
1828 dev = ptx_devices[ord];
1830 /* The mkoffload utility emits a struct of pointers/integers at the
1831 start of each offload image. The array of kernel names and the
1832 function addresses form a one-to-one correspondence.  */
1834 var_entries = img_header->var_num;
1835 var_names = img_header->var_names;
1836 fn_entries = img_header->fn_num;
1837 fn_descs = img_header->fn_descs;
1839 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1840 * (fn_entries + var_entries));
1841 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1842 * fn_entries);
1844 *target_table = targ_tbl;
1846 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1847 new_image->target_data = target_data;
1848 new_image->module = module;
1849 new_image->fns = targ_fns;
1851 pthread_mutex_lock (&dev->image_lock);
1852 new_image->next = dev->images;
1853 dev->images = new_image;
1854 pthread_mutex_unlock (&dev->image_lock);
1856 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1858 CUfunction function;
1859 int nregs, mthrs;
1861 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1862 fn_descs[i].fn);
1863 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1864 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1865 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1866 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1868 targ_fns->fn = function;
1869 targ_fns->launch = &fn_descs[i];
1870 targ_fns->regs_per_thread = nregs;
1871 targ_fns->max_threads_per_block = mthrs;
1873 targ_tbl->start = (uintptr_t) targ_fns;
1874 targ_tbl->end = targ_tbl->start + 1;
1877 for (j = 0; j < var_entries; j++, targ_tbl++)
1879 CUdeviceptr var;
1880 size_t bytes;
1882 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1883 &var, &bytes, module, var_names[j]);
1885 targ_tbl->start = (uintptr_t) var;
1886 targ_tbl->end = targ_tbl->start + bytes;
1889 nvptx_set_clocktick (module, dev);
1891 return fn_entries + var_entries;
1894 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1895 array of function descriptors allocated by GOMP_OFFLOAD_load_image.  */
1897 bool
1898 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1900 struct ptx_image_data *image, **prev_p;
1901 struct ptx_device *dev = ptx_devices[ord];
1903 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1905 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1906 " (expected %u, received %u)",
1907 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1908 return false;
1911 bool ret = true;
1912 pthread_mutex_lock (&dev->image_lock);
1913 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1914 if (image->target_data == target_data)
1916 *prev_p = image->next;
1917 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1918 ret = false;
1919 free (image->fns);
1920 free (image);
1921 break;
1923 pthread_mutex_unlock (&dev->image_lock);
1924 return ret;
1927 void *
1928 GOMP_OFFLOAD_alloc (int ord, size_t size)
1930 if (!nvptx_attach_host_thread_to_device (ord))
1931 return NULL;
1932 return nvptx_alloc (size);
1935 bool
1936 GOMP_OFFLOAD_free (int ord, void *ptr)
1938 return (nvptx_attach_host_thread_to_device (ord)
1939 && nvptx_free (ptr));
1942 bool
1943 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1945 return (nvptx_attach_host_thread_to_device (ord)
1946 && nvptx_dev2host (dst, src, n));
1949 bool
1950 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1952 return (nvptx_attach_host_thread_to_device (ord)
1953 && nvptx_host2dev (dst, src, n));
1956 bool
1957 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1959 struct ptx_device *ptx_dev = ptx_devices[ord];
1960 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1961 ptx_dev->null_stream->stream);
1962 return true;
1965 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1967 void
1968 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1969 void **hostaddrs, void **devaddrs,
1970 int async, unsigned *dims, void *targ_mem_desc)
1972 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1975 void
1976 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1978 struct nvptx_thread *nvthd = nvptx_thread ();
1979 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1981 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1982 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1983 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1987 GOMP_OFFLOAD_openacc_async_test (int async)
1989 return nvptx_async_test (async);
1993 GOMP_OFFLOAD_openacc_async_test_all (void)
1995 return nvptx_async_test_all ();
1998 void
1999 GOMP_OFFLOAD_openacc_async_wait (int async)
2001 nvptx_wait (async);
2004 void
2005 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
2007 nvptx_wait_async (async1, async2);
2010 void
2011 GOMP_OFFLOAD_openacc_async_wait_all (void)
2013 nvptx_wait_all ();
2016 void
2017 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
2019 nvptx_wait_all_async (async);
2022 void
2023 GOMP_OFFLOAD_openacc_async_set_async (int async)
2025 nvptx_set_async (async);
2028 void *
2029 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2031 struct ptx_device *ptx_dev;
2032 struct nvptx_thread *nvthd
2033 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2034 CUcontext thd_ctx;
2036 ptx_dev = ptx_devices[ord];
2038 assert (ptx_dev);
2040 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2042 assert (ptx_dev->ctx);
2044 if (!thd_ctx)
2045 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2047 nvthd->current_stream = ptx_dev->null_stream;
2048 nvthd->ptx_dev = ptx_dev;
2050 return (void *) nvthd;
2053 void
2054 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2056 free (data);
2059 void *
2060 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2062 return nvptx_get_current_cuda_device ();
2065 void *
2066 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2068 return nvptx_get_current_cuda_context ();
2071 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2073 void *
2074 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2076 return nvptx_get_cuda_stream (async);
2079 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2082 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2084 return nvptx_set_cuda_stream (async, stream);
2087 /* Adjust launch dimensions: pick good values for number of blocks and warps
2088 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2089 own limits. */
2091 static void
2092 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2093 struct ptx_device *ptx_dev,
2094 int *teams_p, int *threads_p)
2096 int max_warps_block = fn->max_threads_per_block / 32;
2097 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2098 backend and libgcc, and matches the documented limit of all GPUs as of 2015. */
2099 if (max_warps_block > 32)
2100 max_warps_block = 32;
2101 if (*threads_p <= 0)
2102 *threads_p = 8;
2103 if (*threads_p > max_warps_block)
2104 *threads_p = max_warps_block;
2106 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2107 /* This is an estimate of how many blocks the device can host simultaneously.
2108 The actual limit, which may be lower, can be queried with the "occupancy
2109 control" driver interface (since CUDA 6.0).  */
2110 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2111 if (*teams_p <= 0 || *teams_p > max_blocks)
2112 *teams_p = max_blocks;
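/* Worked example with hypothetical values: for a kernel using 32 registers
   per thread and *threads_p = 8 warps, regs_per_block = 32 * 32 * 8 = 8192;
   on a device with regs_per_sm = 65536 and num_sms = 16 the estimate above
   yields max_blocks = (65536 / 8192) * 16 = 128 teams.  */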
2115 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2116 target regions. */
2118 static size_t
2119 nvptx_stacks_size ()
2121 return 128 * 1024;
2124 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2126 static void *
2127 nvptx_stacks_alloc (size_t size, int num)
2129 CUdeviceptr stacks;
2130 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2131 if (r != CUDA_SUCCESS)
2132 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2133 return (void *) stacks;
2136 /* Release storage previously allocated by nvptx_stacks_alloc. */
2138 static void
2139 nvptx_stacks_free (void *p, int num)
2141 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2142 if (r != CUDA_SUCCESS)
2143 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2146 void
2147 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2149 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2150 CUresult r;
2151 struct ptx_device *ptx_dev = ptx_devices[ord];
2152 const char *maybe_abort_msg = "(perhaps abort was called)";
2153 int teams = 0, threads = 0;
2155 if (!args)
2156 GOMP_PLUGIN_fatal ("No target arguments provided");
2157 while (*args)
2159 intptr_t id = (intptr_t) *args++, val;
2160 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2161 val = (intptr_t) *args++;
2162 else
2163 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2164 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2165 continue;
2166 val = val > INT_MAX ? INT_MAX : val;
2167 id &= GOMP_TARGET_ARG_ID_MASK;
2168 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2169 teams = val;
2170 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2171 threads = val;
2173 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2175 size_t stack_size = nvptx_stacks_size ();
2176 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2177 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2178 size_t fn_args_size = sizeof fn_args;
2179 void *config[] = {
2180 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2181 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2182 CU_LAUNCH_PARAM_END
2184 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2185 32, threads, 1, 0, ptx_dev->null_stream->stream,
2186 NULL, config);
2187 if (r != CUDA_SUCCESS)
2188 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2190 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2191 if (r == CUDA_ERROR_LAUNCH_FAILED)
2192 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2193 maybe_abort_msg);
2194 else if (r != CUDA_SUCCESS)
2195 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2196 nvptx_stacks_free (stacks, teams * threads);
2199 void
2200 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2201 void *async_data)
2203 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");