[libgomp, nvptx] Fall back to cuLinkAddData/cuLinkCreate if _v2 not found
libgomp/plugin/plugin-nvptx.c
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
65 const char *, unsigned, CUjit_option *, void **);
66 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
67 #endif
69 #define DO_PRAGMA(x) _Pragma (#x)
71 #if PLUGIN_NVPTX_DYNAMIC
72 # include <dlfcn.h>
74 struct cuda_lib_s {
76 # define CUDA_ONE_CALL(call) \
77 __typeof (call) *call;
78 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
79 CUDA_ONE_CALL (call)
80 #include "cuda-lib.def"
81 # undef CUDA_ONE_CALL
82 # undef CUDA_ONE_CALL_MAYBE_NULL
84 } cuda_lib;
86 /* -1 if init_cuda_lib has not been called yet, false
87 if it has been and failed, true if it has been and succeeded. */
88 static signed char cuda_lib_inited = -1;
90 /* Dynamically load the CUDA driver library and initialize function
91 pointers; return false if unsuccessful, true if successful.  */
92 static bool
93 init_cuda_lib (void)
95 if (cuda_lib_inited != -1)
96 return cuda_lib_inited;
97 const char *cuda_runtime_lib = "libcuda.so.1";
98 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
99 cuda_lib_inited = false;
100 if (h == NULL)
101 return false;
103 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
104 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
105 # define CUDA_ONE_CALL_1(call, allow_null) \
106 cuda_lib.call = dlsym (h, #call); \
107 if (!allow_null && cuda_lib.call == NULL) \
108 return false;
109 #include "cuda-lib.def"
110 # undef CUDA_ONE_CALL
111 # undef CUDA_ONE_CALL_1
112 # undef CUDA_ONE_CALL_MAYBE_NULL
114 cuda_lib_inited = true;
115 return true;
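/* Expansion sketch (illustrative, not generated code): for an entry
   "CUDA_ONE_CALL (cuInit)" in cuda-lib.def, the struct above gains a member
   "__typeof (cuInit) *cuInit;" and the CUDA_ONE_CALL_1 expansion in
   init_cuda_lib effectively performs:

     cuda_lib.cuInit = dlsym (h, "cuInit");
     if (cuda_lib.cuInit == NULL)
       return false;

   For CUDA_ONE_CALL_MAYBE_NULL entries the NULL check is compiled away, so a
   missing symbol merely leaves the pointer NULL.  */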
117 # define CUDA_CALL_PREFIX cuda_lib.
118 #else
120 # define CUDA_ONE_CALL(call)
121 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
122 #include "cuda-lib.def"
123 #undef CUDA_ONE_CALL_MAYBE_NULL
124 #undef CUDA_ONE_CALL
126 # define CUDA_CALL_PREFIX
127 # define init_cuda_lib() true
128 #endif
130 #include "secure_getenv.h"
132 #undef MIN
133 #undef MAX
134 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
135 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
137 /* Convenience macros for the frequently used CUDA library call and
138 error handling sequence as well as CUDA library calls that
139 do the error checking themselves or don't do it at all. */
141 #define CUDA_CALL_ERET(ERET, FN, ...) \
142 do { \
143 unsigned __r \
144 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
145 if (__r != CUDA_SUCCESS) \
147 GOMP_PLUGIN_error (#FN " error: %s", \
148 cuda_error (__r)); \
149 return ERET; \
151 } while (0)
153 #define CUDA_CALL(FN, ...) \
154 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
156 #define CUDA_CALL_ASSERT(FN, ...) \
157 do { \
158 unsigned __r \
159 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
160 if (__r != CUDA_SUCCESS) \
162 GOMP_PLUGIN_fatal (#FN " error: %s", \
163 cuda_error (__r)); \
165 } while (0)
167 #define CUDA_CALL_NOCHECK(FN, ...) \
168 CUDA_CALL_PREFIX FN (__VA_ARGS__)
170 #define CUDA_CALL_EXISTS(FN) \
171 CUDA_CALL_PREFIX FN
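/* Sketch of the cuLinkCreate/cuLinkAddData "_v2" fallback this change adds
   (see link_ptx below): CUDA_CALL_EXISTS yields the address of the weak
   symbol (or the dlsym'd pointer in the PLUGIN_NVPTX_DYNAMIC case), so a
   NULL address means the newer entry point is unavailable:

     if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
       CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
     else
       CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
*/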
173 static const char *
174 cuda_error (CUresult r)
176 const char *fallback = "unknown cuda error";
177 const char *desc;
179 if (!CUDA_CALL_EXISTS (cuGetErrorString))
180 return fallback;
182 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
183 if (r == CUDA_SUCCESS)
184 return desc;
186 return fallback;
189 static unsigned int instantiated_devices = 0;
190 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
192 struct ptx_stream
194 CUstream stream;
195 pthread_t host_thread;
196 bool multithreaded;
198 CUdeviceptr d;
199 void *h;
200 void *h_begin;
201 void *h_end;
202 void *h_next;
203 void *h_prev;
204 void *h_tail;
206 struct ptx_stream *next;
209 /* Thread-specific data for PTX. */
211 struct nvptx_thread
213 struct ptx_stream *current_stream;
214 struct ptx_device *ptx_dev;
217 static bool
218 map_init (struct ptx_stream *s)
220 int size = getpagesize ();
222 assert (s);
223 assert (!s->d);
224 assert (!s->h);
226 CUDA_CALL (cuMemAllocHost, &s->h, size);
227 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
229 assert (s->h);
231 s->h_begin = s->h;
232 s->h_end = s->h_begin + size;
233 s->h_next = s->h_prev = s->h_tail = s->h_begin;
235 assert (s->h_next);
236 assert (s->h_end);
237 return true;
240 static bool
241 map_fini (struct ptx_stream *s)
243 CUDA_CALL (cuMemFreeHost, s->h);
244 return true;
247 static void
248 map_pop (struct ptx_stream *s)
250 assert (s != NULL);
251 assert (s->h_next);
252 assert (s->h_prev);
253 assert (s->h_tail);
255 s->h_tail = s->h_next;
257 if (s->h_tail >= s->h_end)
258 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
260 if (s->h_next == s->h_tail)
261 s->h_prev = s->h_next;
263 assert (s->h_next >= s->h_begin);
264 assert (s->h_tail >= s->h_begin);
265 assert (s->h_prev >= s->h_begin);
267 assert (s->h_next <= s->h_end);
268 assert (s->h_tail <= s->h_end);
269 assert (s->h_prev <= s->h_end);
272 static void
273 map_push (struct ptx_stream *s, size_t size, void **h, void **d)
275 int left;
276 int offset;
278 assert (s != NULL);
280 left = s->h_end - s->h_next;
282 assert (s->h_prev);
283 assert (s->h_next);
285 if (size >= left)
287 assert (s->h_next == s->h_prev);
288 s->h_next = s->h_prev = s->h_tail = s->h_begin;
291 assert (s->h_next);
293 offset = s->h_next - s->h;
295 *d = (void *)(s->d + offset);
296 *h = (void *)(s->h + offset);
298 s->h_prev = s->h_next;
299 s->h_next += size;
301 assert (s->h_prev);
302 assert (s->h_next);
304 assert (s->h_next >= s->h_begin);
305 assert (s->h_tail >= s->h_begin);
306 assert (s->h_prev >= s->h_begin);
307 assert (s->h_next <= s->h_end);
308 assert (s->h_tail <= s->h_end);
309 assert (s->h_prev <= s->h_end);
311 return;
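/* Usage sketch (this mirrors nvptx_exec below): reserve room for MAPNUM
   argument pointers in the page mapped on both host and device, fill the
   host side, launch using the device pointer, and release the chunk once
   the kernel's completion event has fired:

     void *hp, *dp;
     map_push (stream, mapnum * sizeof (void *), &hp, &dp);
     for (i = 0; i < mapnum; i++)
       ((void **) hp)[i] = devaddrs[i];
     ... launch with &dp as the single kernel argument ...
     map_pop (stream);
*/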
314 /* Target data function launch information. */
316 struct targ_fn_launch
318 const char *fn;
319 unsigned short dim[GOMP_DIM_MAX];
322 /* Target PTX object information. */
324 struct targ_ptx_obj
326 const char *code;
327 size_t size;
330 /* Target data image information. */
332 typedef struct nvptx_tdata
334 const struct targ_ptx_obj *ptx_objs;
335 unsigned ptx_num;
337 const char *const *var_names;
338 unsigned var_num;
340 const struct targ_fn_launch *fn_descs;
341 unsigned fn_num;
342 } nvptx_tdata_t;
344 /* Descriptor of a loaded function. */
346 struct targ_fn_descriptor
348 CUfunction fn;
349 const struct targ_fn_launch *launch;
350 int regs_per_thread;
351 int max_threads_per_block;
354 /* A loaded PTX image. */
355 struct ptx_image_data
357 const void *target_data;
358 CUmodule module;
360 struct targ_fn_descriptor *fns; /* Array of functions. */
362 struct ptx_image_data *next;
365 struct ptx_device
367 CUcontext ctx;
368 bool ctx_shared;
369 CUdevice dev;
370 struct ptx_stream *null_stream;
371 /* All non-null streams associated with this device (actually context),
372 either created implicitly or passed in from the user (via
373 acc_set_cuda_stream). */
374 struct ptx_stream *active_streams;
375 struct {
376 struct ptx_stream **arr;
377 int size;
378 } async_streams;
379 /* A lock for use when manipulating the above stream list and array. */
380 pthread_mutex_t stream_lock;
381 int ord;
382 bool overlap;
383 bool map;
384 bool concur;
385 bool mkern;
386 int mode;
387 int clock_khz;
388 int num_sms;
389 int regs_per_block;
390 int regs_per_sm;
391 int warp_size;
392 int max_threads_per_block;
393 int max_threads_per_multiprocessor;
394 int default_dims[GOMP_DIM_MAX];
396 struct ptx_image_data *images; /* Images loaded on device. */
397 pthread_mutex_t image_lock; /* Lock for above list. */
399 struct ptx_device *next;
402 enum ptx_event_type
404 PTX_EVT_MEM,
405 PTX_EVT_KNL,
406 PTX_EVT_SYNC,
407 PTX_EVT_ASYNC_CLEANUP
410 struct ptx_event
412 CUevent *evt;
413 int type;
414 void *addr;
415 int ord;
416 int val;
418 struct ptx_event *next;
421 static pthread_mutex_t ptx_event_lock;
422 static struct ptx_event *ptx_events;
424 static struct ptx_device **ptx_devices;
426 static inline struct nvptx_thread *
427 nvptx_thread (void)
429 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
432 static bool
433 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
435 int i;
436 struct ptx_stream *null_stream
437 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
439 null_stream->stream = NULL;
440 null_stream->host_thread = pthread_self ();
441 null_stream->multithreaded = true;
442 null_stream->d = (CUdeviceptr) NULL;
443 null_stream->h = NULL;
444 if (!map_init (null_stream))
445 return false;
447 ptx_dev->null_stream = null_stream;
448 ptx_dev->active_streams = NULL;
449 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
451 if (concurrency < 1)
452 concurrency = 1;
454 /* This is just a guess -- make space for as many async streams as the
455 current device is capable of concurrently executing. This can grow
456 later as necessary. No streams are created yet. */
457 ptx_dev->async_streams.arr
458 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
459 ptx_dev->async_streams.size = concurrency;
461 for (i = 0; i < concurrency; i++)
462 ptx_dev->async_streams.arr[i] = NULL;
464 return true;
467 static bool
468 fini_streams_for_device (struct ptx_device *ptx_dev)
470 free (ptx_dev->async_streams.arr);
472 bool ret = true;
473 while (ptx_dev->active_streams != NULL)
475 struct ptx_stream *s = ptx_dev->active_streams;
476 ptx_dev->active_streams = ptx_dev->active_streams->next;
478 ret &= map_fini (s);
480 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
481 if (r != CUDA_SUCCESS)
483 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
484 ret = false;
486 free (s);
489 ret &= map_fini (ptx_dev->null_stream);
490 free (ptx_dev->null_stream);
491 return ret;
494 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
495 thread THREAD (and also current device/context). If CREATE is true, create
496 the stream if it does not exist (or use EXISTING if it is non-NULL), and
497 associate the stream with the same thread argument.  Return the stream
498 to use as the result.  */
500 static struct ptx_stream *
501 select_stream_for_async (int async, pthread_t thread, bool create,
502 CUstream existing)
504 struct nvptx_thread *nvthd = nvptx_thread ();
505 /* Local copy of TLS variable. */
506 struct ptx_device *ptx_dev = nvthd->ptx_dev;
507 struct ptx_stream *stream = NULL;
508 int orig_async = async;
510 /* The special value acc_async_noval (-1) maps (for now) to an
511 implicitly-created stream, which is then handled the same as any other
512 numbered async stream. Other options are available, e.g. using the null
513 stream for anonymous async operations, or choosing an idle stream from an
514 active set. But, stick with this for now. */
515 if (async > acc_async_sync)
516 async++;
518 if (create)
519 pthread_mutex_lock (&ptx_dev->stream_lock);
521 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
522 null stream, and in fact better performance may be obtainable if it doesn't
523 (because the null stream enforces overly-strict synchronisation with
524 respect to other streams for legacy reasons, and that's probably not
525 needed with OpenACC). Maybe investigate later. */
526 if (async == acc_async_sync)
527 stream = ptx_dev->null_stream;
528 else if (async >= 0 && async < ptx_dev->async_streams.size
529 && ptx_dev->async_streams.arr[async] && !(create && existing))
530 stream = ptx_dev->async_streams.arr[async];
531 else if (async >= 0 && create)
533 if (async >= ptx_dev->async_streams.size)
535 int i, newsize = ptx_dev->async_streams.size * 2;
537 if (async >= newsize)
538 newsize = async + 1;
540 ptx_dev->async_streams.arr
541 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
542 newsize * sizeof (struct ptx_stream *));
544 for (i = ptx_dev->async_streams.size; i < newsize; i++)
545 ptx_dev->async_streams.arr[i] = NULL;
547 ptx_dev->async_streams.size = newsize;
550 /* Create a new stream on-demand if there isn't one already, or if we're
551 setting a particular async value to an existing (externally-provided)
552 stream. */
553 if (!ptx_dev->async_streams.arr[async] || existing)
555 CUresult r;
556 struct ptx_stream *s
557 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
559 if (existing)
560 s->stream = existing;
561 else
563 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
564 CU_STREAM_DEFAULT);
565 if (r != CUDA_SUCCESS)
567 pthread_mutex_unlock (&ptx_dev->stream_lock);
568 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
569 cuda_error (r));
573 /* If CREATE is true, we're going to be queueing some work on this
574 stream. Associate it with the current host thread. */
575 s->host_thread = thread;
576 s->multithreaded = false;
578 s->d = (CUdeviceptr) NULL;
579 s->h = NULL;
580 if (!map_init (s))
582 pthread_mutex_unlock (&ptx_dev->stream_lock);
583 GOMP_PLUGIN_fatal ("map_init fail");
586 s->next = ptx_dev->active_streams;
587 ptx_dev->active_streams = s;
588 ptx_dev->async_streams.arr[async] = s;
591 stream = ptx_dev->async_streams.arr[async];
593 else if (async < 0)
595 if (create)
596 pthread_mutex_unlock (&ptx_dev->stream_lock);
597 GOMP_PLUGIN_fatal ("bad async %d", async);
600 if (create)
602 assert (stream != NULL);
604 /* If we're trying to use the same stream from different threads
605 simultaneously, set stream->multithreaded to true. This affects the
606 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
607 only wait for asynchronous launches from the same host thread they are
608 invoked on. If multiple threads use the same async value, we make note
609 of that here and fall back to testing/waiting for all threads in those
610 functions. */
611 if (thread != stream->host_thread)
612 stream->multithreaded = true;
614 pthread_mutex_unlock (&ptx_dev->stream_lock);
616 else if (stream && !stream->multithreaded
617 && !pthread_equal (stream->host_thread, thread))
618 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
620 return stream;
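/* Value-mapping sketch (values per openacc.h): acc_async_sync is -2 and
   acc_async_noval is -1, so after the "async++" shift above acc_async_noval
   uses async_streams.arr[0] and a user-supplied async N uses arr[N + 1],
   while acc_async_sync stays on the null stream.  */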
623 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
624 should be locked on entry and remains locked on exit. */
626 static bool
627 nvptx_init (void)
629 int ndevs;
631 if (instantiated_devices != 0)
632 return true;
634 ptx_events = NULL;
635 pthread_mutex_init (&ptx_event_lock, NULL);
637 if (!init_cuda_lib ())
638 return false;
640 CUDA_CALL (cuInit, 0);
642 CUDA_CALL (cuDeviceGetCount, &ndevs);
643 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
644 * ndevs);
645 return true;
648 /* Select the N'th PTX device for the current host thread. The device must
649 have been previously opened before calling this function. */
651 static bool
652 nvptx_attach_host_thread_to_device (int n)
654 CUdevice dev;
655 CUresult r;
656 struct ptx_device *ptx_dev;
657 CUcontext thd_ctx;
659 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
660 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
662 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
663 return false;
666 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
667 return true;
668 else
670 CUcontext old_ctx;
672 ptx_dev = ptx_devices[n];
673 if (!ptx_dev)
675 GOMP_PLUGIN_error ("device %d not found", n);
676 return false;
679 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
681 /* We don't necessarily have a current context (e.g. if it has been
682 destroyed).  Pop it if we do, though.  */
683 if (thd_ctx != NULL)
684 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
686 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
688 return true;
691 static struct ptx_device *
692 nvptx_open_device (int n)
694 struct ptx_device *ptx_dev;
695 CUdevice dev, ctx_dev;
696 CUresult r;
697 int async_engines, pi;
699 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
701 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
703 ptx_dev->ord = n;
704 ptx_dev->dev = dev;
705 ptx_dev->ctx_shared = false;
707 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
708 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
710 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
711 return NULL;
714 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
716 /* The current host thread has an active context for a different device.
717 Detach it. */
718 CUcontext old_ctx;
719 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
722 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
724 if (!ptx_dev->ctx)
725 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
726 else
727 ptx_dev->ctx_shared = true;
729 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
730 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
731 ptx_dev->overlap = pi;
733 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
734 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
735 ptx_dev->map = pi;
737 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
738 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
739 ptx_dev->concur = pi;
741 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
742 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
743 ptx_dev->mode = pi;
745 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
746 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
747 ptx_dev->mkern = pi;
749 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
750 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
751 ptx_dev->clock_khz = pi;
753 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
754 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
755 ptx_dev->num_sms = pi;
757 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
758 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
759 ptx_dev->regs_per_block = pi;
761 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
762 in CUDA 6.0 and newer. */
763 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
764 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
765 dev);
766 /* Fallback: use limit of registers per block, which is usually equal. */
767 if (r == CUDA_ERROR_INVALID_VALUE)
768 pi = ptx_dev->regs_per_block;
769 else if (r != CUDA_SUCCESS)
771 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
772 return NULL;
774 ptx_dev->regs_per_sm = pi;
776 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
777 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
778 if (pi != 32)
780 GOMP_PLUGIN_error ("Only warp size 32 is supported");
781 return NULL;
783 ptx_dev->warp_size = pi;
785 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
786 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
787 ptx_dev->max_threads_per_block = pi;
789 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
790 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
791 ptx_dev->max_threads_per_multiprocessor = pi;
793 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
794 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
795 if (r != CUDA_SUCCESS)
796 async_engines = 1;
798 for (int i = 0; i != GOMP_DIM_MAX; i++)
799 ptx_dev->default_dims[i] = 0;
801 ptx_dev->images = NULL;
802 pthread_mutex_init (&ptx_dev->image_lock, NULL);
804 if (!init_streams_for_device (ptx_dev, async_engines))
805 return NULL;
807 return ptx_dev;
810 static bool
811 nvptx_close_device (struct ptx_device *ptx_dev)
813 if (!ptx_dev)
814 return true;
816 if (!fini_streams_for_device (ptx_dev))
817 return false;
819 pthread_mutex_destroy (&ptx_dev->image_lock);
821 if (!ptx_dev->ctx_shared)
822 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
824 free (ptx_dev);
825 return true;
828 static int
829 nvptx_get_num_devices (void)
831 int n;
833 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
834 configurations. */
835 if (sizeof (void *) != 8)
837 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
838 " only 64-bit configurations are supported\n");
839 return 0;
842 /* This function will be called before the plugin has been initialized in
843 order to enumerate available devices, but CUDA API routines can't be used
844 until cuInit has been called. Just call it now (but don't yet do any
845 further initialization). */
846 if (instantiated_devices == 0)
848 if (!init_cuda_lib ())
849 return 0;
850 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
851 /* This is not an error: e.g. we may have CUDA libraries installed but
852 no devices available. */
853 if (r != CUDA_SUCCESS)
855 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
856 cuda_error (r));
857 return 0;
861 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
862 return n;
865 static void
866 notify_var (const char *var_name, const char *env_var)
868 if (env_var == NULL)
869 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
870 else
871 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
874 static void
875 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
877 const char *var_name = "GOMP_NVPTX_JIT";
878 const char *env_var = secure_getenv (var_name);
879 notify_var (var_name, env_var);
881 if (env_var == NULL)
882 return;
884 const char *c = env_var;
885 while (*c != '\0')
887 while (*c == ' ')
888 c++;
890 if (c[0] == '-' && c[1] == 'O'
891 && '0' <= c[2] && c[2] <= '4'
892 && (c[3] == '\0' || c[3] == ' '))
894 *gomp_nvptx_o = c[2] - '0';
895 c += 3;
896 continue;
899 GOMP_PLUGIN_error ("Error parsing %s", var_name);
900 break;
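/* For example (assuming a POSIX shell), running the program as

     GOMP_NVPTX_JIT=-O3 ./a.out

   makes link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL 3 to the PTX JIT;
   values -O0 through -O4 are accepted.  */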
904 static bool
905 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
906 unsigned num_objs)
908 CUjit_option opts[7];
909 void *optvals[7];
910 float elapsed = 0.0;
911 char elog[1024];
912 char ilog[16384];
913 CUlinkState linkstate;
914 CUresult r;
915 void *linkout;
916 size_t linkoutsize __attribute__ ((unused));
918 opts[0] = CU_JIT_WALL_TIME;
919 optvals[0] = &elapsed;
921 opts[1] = CU_JIT_INFO_LOG_BUFFER;
922 optvals[1] = &ilog[0];
924 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
925 optvals[2] = (void *) sizeof ilog;
927 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
928 optvals[3] = &elog[0];
930 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
931 optvals[4] = (void *) sizeof elog;
933 opts[5] = CU_JIT_LOG_VERBOSE;
934 optvals[5] = (void *) 1;
936 static intptr_t gomp_nvptx_o = -1;
938 static bool init_done = false;
939 if (!init_done)
941 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
942 init_done = true;
945 int nopts = 6;
946 if (gomp_nvptx_o != -1)
948 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
949 optvals[nopts] = (void *) gomp_nvptx_o;
950 nopts++;
953 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
954 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
955 else
956 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
958 for (; num_objs--; ptx_objs++)
960 /* cuLinkAddData's 'data' argument erroneously omits the const
961 qualifier. */
962 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
963 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
964 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
965 (char *) ptx_objs->code, ptx_objs->size,
966 0, 0, 0, 0);
967 else
968 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
969 (char *) ptx_objs->code, ptx_objs->size,
970 0, 0, 0, 0);
971 if (r != CUDA_SUCCESS)
973 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
974 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
975 cuda_error (r));
976 return false;
980 GOMP_PLUGIN_debug (0, "Linking\n");
981 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
983 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
984 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
986 if (r != CUDA_SUCCESS)
988 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
989 return false;
992 CUDA_CALL (cuModuleLoadData, module, linkout);
993 CUDA_CALL (cuLinkDestroy, linkstate);
994 return true;
997 static void
998 event_gc (bool memmap_lockable)
1000 struct ptx_event *ptx_event = ptx_events;
1001 struct ptx_event *async_cleanups = NULL;
1002 struct nvptx_thread *nvthd = nvptx_thread ();
1004 pthread_mutex_lock (&ptx_event_lock);
1006 while (ptx_event != NULL)
1008 CUresult r;
1009 struct ptx_event *e = ptx_event;
1011 ptx_event = ptx_event->next;
1013 if (e->ord != nvthd->ptx_dev->ord)
1014 continue;
1016 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1017 if (r == CUDA_SUCCESS)
1019 bool append_async = false;
1020 CUevent *te;
1022 te = e->evt;
1024 switch (e->type)
1026 case PTX_EVT_MEM:
1027 case PTX_EVT_SYNC:
1028 break;
1030 case PTX_EVT_KNL:
1031 map_pop (e->addr);
1032 break;
1034 case PTX_EVT_ASYNC_CLEANUP:
1036 /* The function gomp_plugin_async_unmap_vars needs to claim the
1037 memory-map splay tree lock for the current device, so we
1038 can't call it when one of our callers has already claimed
1039 the lock. In that case, just delay the GC for this event
1040 until later. */
1041 if (!memmap_lockable)
1042 continue;
1044 append_async = true;
1046 break;
1049 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1050 free ((void *)te);
1052 /* Unlink 'e' from ptx_events list. */
1053 if (ptx_events == e)
1054 ptx_events = ptx_events->next;
1055 else
1057 struct ptx_event *e_ = ptx_events;
1058 while (e_->next != e)
1059 e_ = e_->next;
1060 e_->next = e_->next->next;
1063 if (append_async)
1065 e->next = async_cleanups;
1066 async_cleanups = e;
1068 else
1069 free (e);
1073 pthread_mutex_unlock (&ptx_event_lock);
1075 /* We have to do these here, after ptx_event_lock is released. */
1076 while (async_cleanups)
1078 struct ptx_event *e = async_cleanups;
1079 async_cleanups = async_cleanups->next;
1081 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1082 free (e);
1086 static void
1087 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1089 struct ptx_event *ptx_event;
1090 struct nvptx_thread *nvthd = nvptx_thread ();
1092 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1093 || type == PTX_EVT_ASYNC_CLEANUP);
1095 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1096 ptx_event->type = type;
1097 ptx_event->evt = e;
1098 ptx_event->addr = h;
1099 ptx_event->ord = nvthd->ptx_dev->ord;
1100 ptx_event->val = val;
1102 pthread_mutex_lock (&ptx_event_lock);
1104 ptx_event->next = ptx_events;
1105 ptx_events = ptx_event;
1107 pthread_mutex_unlock (&ptx_event_lock);
1110 static void
1111 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1112 int async, unsigned *dims, void *targ_mem_desc)
1114 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1115 CUfunction function;
1116 CUresult r;
1117 int i;
1118 struct ptx_stream *dev_str;
1119 void *kargs[1];
1120 void *hp, *dp;
1121 struct nvptx_thread *nvthd = nvptx_thread ();
1122 int warp_size = nvthd->ptx_dev->warp_size;
1123 const char *maybe_abort_msg = "(perhaps abort was called)";
1125 function = targ_fn->fn;
1127 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1128 assert (dev_str == nvthd->current_stream);
1130 /* Initialize the launch dimensions. Typically this is constant,
1131 provided by the device compiler, but we must permit runtime
1132 values. */
1133 int seen_zero = 0;
1134 for (i = 0; i != GOMP_DIM_MAX; i++)
1136 if (targ_fn->launch->dim[i])
1137 dims[i] = targ_fn->launch->dim[i];
1138 if (!dims[i])
1139 seen_zero = 1;
1142 if (seen_zero)
1144 pthread_mutex_lock (&ptx_dev_lock);
1146 static int gomp_openacc_dims[GOMP_DIM_MAX];
1147 if (!gomp_openacc_dims[0])
1149 /* See if the user provided GOMP_OPENACC_DIM environment
1150 variable to specify runtime defaults. */
1151 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1152 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
1155 if (!nvthd->ptx_dev->default_dims[0])
1157 int default_dims[GOMP_DIM_MAX];
1158 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1159 default_dims[i] = gomp_openacc_dims[i];
1161 int gang, worker, vector;
1163 int block_size = nvthd->ptx_dev->max_threads_per_block;
1164 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
1165 int dev_size = nvthd->ptx_dev->num_sms;
1166 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1167 " dev_size=%d, cpu_size=%d\n",
1168 warp_size, block_size, dev_size, cpu_size);
1170 gang = (cpu_size / block_size) * dev_size;
1171 worker = block_size / warp_size;
1172 vector = warp_size;
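/* Worked example (illustrative numbers only): with block_size = 1024,
   cpu_size = 2048, dev_size = 16 and warp_size = 32 this gives
   gang = (2048 / 1024) * 16 = 32, worker = 1024 / 32 = 32, vector = 32.  */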
1175 /* There is no upper bound on the gang size. The best size
1176 matches the hardware configuration. Logical gangs are
1177 scheduled onto physical hardware. To maximize usage, we
1178 should guess a large number. */
1179 if (default_dims[GOMP_DIM_GANG] < 1)
1180 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1181 /* The worker size must not exceed the hardware. */
1182 if (default_dims[GOMP_DIM_WORKER] < 1
1183 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1184 default_dims[GOMP_DIM_WORKER] = worker;
1185 /* The vector size must exactly match the hardware. */
1186 if (default_dims[GOMP_DIM_VECTOR] < 1
1187 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1188 default_dims[GOMP_DIM_VECTOR] = vector;
1190 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1191 default_dims[GOMP_DIM_GANG],
1192 default_dims[GOMP_DIM_WORKER],
1193 default_dims[GOMP_DIM_VECTOR]);
1195 for (i = 0; i != GOMP_DIM_MAX; i++)
1196 nvthd->ptx_dev->default_dims[i] = default_dims[i];
1198 pthread_mutex_unlock (&ptx_dev_lock);
1201 bool default_dim_p[GOMP_DIM_MAX];
1202 for (i = 0; i != GOMP_DIM_MAX; i++)
1204 default_dim_p[i] = !dims[i];
1205 if (default_dim_p[i])
1206 dims[i] = nvthd->ptx_dev->default_dims[i];
1209 if (default_dim_p[GOMP_DIM_VECTOR])
1210 dims[GOMP_DIM_VECTOR]
1211 = MIN (dims[GOMP_DIM_VECTOR],
1212 (targ_fn->max_threads_per_block / warp_size * warp_size));
1214 if (default_dim_p[GOMP_DIM_WORKER])
1215 dims[GOMP_DIM_WORKER]
1216 = MIN (dims[GOMP_DIM_WORKER],
1217 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
1221 /* Check if the accelerator has sufficient hardware resources to
1222 launch the offloaded kernel. */
1223 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
1224 > targ_fn->max_threads_per_block)
1226 int suggest_workers
1227 = targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
1228 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1229 " launch '%s' with num_workers = %d; recompile the"
1230 " program with 'num_workers = %d' on that offloaded"
1231 " region or '-fopenacc-dim=:%d'",
1232 targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1233 suggest_workers, suggest_workers);
1236 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1237 the host and the device. HP is a host pointer to the new chunk, and DP is
1238 the corresponding device pointer. */
1239 map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
1241 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1243 /* Copy the array of arguments to the mapped page. */
1244 for (i = 0; i < mapnum; i++)
1245 ((void **) hp)[i] = devaddrs[i];
1247 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1248 fact have the same value on a unified-memory system). */
1249 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1250 mapnum * sizeof (void *));
1251 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1252 " gangs=%u, workers=%u, vectors=%u\n",
1253 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1254 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1256 // OpenACC        CUDA
1258 // num_gangs      nctaid.x
1259 // num_workers    ntid.y
1260 // vector length  ntid.x
1262 kargs[0] = &dp;
1263 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1264 dims[GOMP_DIM_GANG], 1, 1,
1265 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1266 0, dev_str->stream, kargs, 0);
1268 #ifndef DISABLE_ASYNC
1269 if (async < acc_async_noval)
1271 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1272 if (r == CUDA_ERROR_LAUNCH_FAILED)
1273 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1274 maybe_abort_msg);
1275 else if (r != CUDA_SUCCESS)
1276 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1278 else
1280 CUevent *e;
1282 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1284 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1285 if (r == CUDA_ERROR_LAUNCH_FAILED)
1286 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1287 maybe_abort_msg);
1288 else if (r != CUDA_SUCCESS)
1289 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1291 event_gc (true);
1293 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1295 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1297 #else
1298 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1299 if (r == CUDA_ERROR_LAUNCH_FAILED)
1300 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1301 maybe_abort_msg);
1302 else if (r != CUDA_SUCCESS)
1303 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1304 #endif
1306 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1307 targ_fn->launch->fn);
1309 #ifndef DISABLE_ASYNC
1310 if (async < acc_async_noval)
1311 #endif
1312 map_pop (dev_str);
1315 void * openacc_get_current_cuda_context (void);
1317 static void *
1318 nvptx_alloc (size_t s)
1320 CUdeviceptr d;
1322 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1323 return (void *) d;
1326 static bool
1327 nvptx_free (void *p)
1329 CUdeviceptr pb;
1330 size_t ps;
1332 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1333 if ((CUdeviceptr) p != pb)
1335 GOMP_PLUGIN_error ("invalid device address");
1336 return false;
1339 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1340 return true;
1344 static bool
1345 nvptx_host2dev (void *d, const void *h, size_t s)
1347 CUdeviceptr pb;
1348 size_t ps;
1349 struct nvptx_thread *nvthd = nvptx_thread ();
1351 if (!s)
1352 return true;
1353 if (!d)
1355 GOMP_PLUGIN_error ("invalid device address");
1356 return false;
1359 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1361 if (!pb)
1363 GOMP_PLUGIN_error ("invalid device address");
1364 return false;
1366 if (!h)
1368 GOMP_PLUGIN_error ("invalid host address");
1369 return false;
1371 if (d == h)
1373 GOMP_PLUGIN_error ("invalid host or device address");
1374 return false;
1376 if ((void *)(d + s) > (void *)(pb + ps))
1378 GOMP_PLUGIN_error ("invalid size");
1379 return false;
1382 #ifndef DISABLE_ASYNC
1383 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1385 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1386 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1387 event_gc (false);
1388 CUDA_CALL (cuMemcpyHtoDAsync,
1389 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1390 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1391 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1393 else
1394 #endif
1395 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1397 return true;
1400 static bool
1401 nvptx_dev2host (void *h, const void *d, size_t s)
1403 CUdeviceptr pb;
1404 size_t ps;
1405 struct nvptx_thread *nvthd = nvptx_thread ();
1407 if (!s)
1408 return true;
1409 if (!d)
1411 GOMP_PLUGIN_error ("invalid device address");
1412 return false;
1415 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1417 if (!pb)
1419 GOMP_PLUGIN_error ("invalid device address");
1420 return false;
1422 if (!h)
1424 GOMP_PLUGIN_error ("invalid host address");
1425 return false;
1427 if (d == h)
1429 GOMP_PLUGIN_error ("invalid host or device address");
1430 return false;
1432 if ((void *)(d + s) > (void *)(pb + ps))
1434 GOMP_PLUGIN_error ("invalid size");
1435 return false;
1438 #ifndef DISABLE_ASYNC
1439 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1441 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1442 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1443 event_gc (false);
1444 CUDA_CALL (cuMemcpyDtoHAsync,
1445 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1446 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1447 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1449 else
1450 #endif
1451 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1453 return true;
1456 static void
1457 nvptx_set_async (int async)
1459 struct nvptx_thread *nvthd = nvptx_thread ();
1460 nvthd->current_stream
1461 = select_stream_for_async (async, pthread_self (), true, NULL);
1464 static int
1465 nvptx_async_test (int async)
1467 CUresult r;
1468 struct ptx_stream *s;
1470 s = select_stream_for_async (async, pthread_self (), false, NULL);
1472 if (!s)
1473 GOMP_PLUGIN_fatal ("unknown async %d", async);
1475 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1476 if (r == CUDA_SUCCESS)
1478 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1479 whether all work has completed on this stream, and if so omits the call
1480 to the wait hook. If that happens, event_gc might not get called
1481 (which prevents variables from getting unmapped and their associated
1482 device storage freed), so call it here. */
1483 event_gc (true);
1484 return 1;
1486 else if (r == CUDA_ERROR_NOT_READY)
1487 return 0;
1489 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1491 return 0;
1494 static int
1495 nvptx_async_test_all (void)
1497 struct ptx_stream *s;
1498 pthread_t self = pthread_self ();
1499 struct nvptx_thread *nvthd = nvptx_thread ();
1501 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1503 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1505 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1506 && CUDA_CALL_NOCHECK (cuStreamQuery,
1507 s->stream) == CUDA_ERROR_NOT_READY)
1509 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1510 return 0;
1514 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1516 event_gc (true);
1518 return 1;
1521 static void
1522 nvptx_wait (int async)
1524 struct ptx_stream *s;
1526 s = select_stream_for_async (async, pthread_self (), false, NULL);
1527 if (!s)
1528 GOMP_PLUGIN_fatal ("unknown async %d", async);
1530 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1532 event_gc (true);
1535 static void
1536 nvptx_wait_async (int async1, int async2)
1538 CUevent *e;
1539 struct ptx_stream *s1, *s2;
1540 pthread_t self = pthread_self ();
1542 /* The stream that is waiting (rather than being waited for) doesn't
1543 necessarily have to exist already. */
1544 s2 = select_stream_for_async (async2, self, true, NULL);
1546 s1 = select_stream_for_async (async1, self, false, NULL);
1547 if (!s1)
1548 GOMP_PLUGIN_fatal ("invalid async 1\n");
1550 if (s1 == s2)
1551 GOMP_PLUGIN_fatal ("identical parameters");
1553 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1555 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1557 event_gc (true);
1559 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1561 event_add (PTX_EVT_SYNC, e, NULL, 0);
1563 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1566 static void
1567 nvptx_wait_all (void)
1569 CUresult r;
1570 struct ptx_stream *s;
1571 pthread_t self = pthread_self ();
1572 struct nvptx_thread *nvthd = nvptx_thread ();
1574 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1576 /* Wait for active streams initiated by this thread (or by multiple threads)
1577 to complete. */
1578 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1580 if (s->multithreaded || pthread_equal (s->host_thread, self))
1582 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1583 if (r == CUDA_SUCCESS)
1584 continue;
1585 else if (r != CUDA_ERROR_NOT_READY)
1586 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1588 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1592 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1594 event_gc (true);
1597 static void
1598 nvptx_wait_all_async (int async)
1600 struct ptx_stream *waiting_stream, *other_stream;
1601 CUevent *e;
1602 struct nvptx_thread *nvthd = nvptx_thread ();
1603 pthread_t self = pthread_self ();
1605 /* The stream doing the waiting. This could be the first mention of the
1606 stream, so create it if necessary. */
1607 waiting_stream
1608 = select_stream_for_async (async, pthread_self (), true, NULL);
1610 /* Launches on the null stream already block on other streams in the
1611 context. */
1612 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1613 return;
1615 event_gc (true);
1617 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1619 for (other_stream = nvthd->ptx_dev->active_streams;
1620 other_stream != NULL;
1621 other_stream = other_stream->next)
1623 if (!other_stream->multithreaded
1624 && !pthread_equal (other_stream->host_thread, self))
1625 continue;
1627 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1629 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1631 /* Record an event on the waited-for stream. */
1632 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1634 event_add (PTX_EVT_SYNC, e, NULL, 0);
1636 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1639 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1642 static void *
1643 nvptx_get_current_cuda_device (void)
1645 struct nvptx_thread *nvthd = nvptx_thread ();
1647 if (!nvthd || !nvthd->ptx_dev)
1648 return NULL;
1650 return &nvthd->ptx_dev->dev;
1653 static void *
1654 nvptx_get_current_cuda_context (void)
1656 struct nvptx_thread *nvthd = nvptx_thread ();
1658 if (!nvthd || !nvthd->ptx_dev)
1659 return NULL;
1661 return nvthd->ptx_dev->ctx;
1664 static void *
1665 nvptx_get_cuda_stream (int async)
1667 struct ptx_stream *s;
1668 struct nvptx_thread *nvthd = nvptx_thread ();
1670 if (!nvthd || !nvthd->ptx_dev)
1671 return NULL;
1673 s = select_stream_for_async (async, pthread_self (), false, NULL);
1675 return s ? s->stream : NULL;
1678 static int
1679 nvptx_set_cuda_stream (int async, void *stream)
1681 struct ptx_stream *oldstream;
1682 pthread_t self = pthread_self ();
1683 struct nvptx_thread *nvthd = nvptx_thread ();
1685 if (async < 0)
1686 GOMP_PLUGIN_fatal ("bad async %d", async);
1688 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1690 /* We have a list of active streams and an array mapping async values to
1691 entries of that list. We need to take "ownership" of the passed-in stream,
1692 and add it to our list, removing the previous entry also (if there was one)
1693 in order to prevent resource leaks. Note the potential for surprise
1694 here: maybe we should keep track of passed-in streams and leave it up to
1695 the user to tidy those up, but that doesn't work for stream handles
1696 returned from acc_get_cuda_stream above... */
1698 oldstream = select_stream_for_async (async, self, false, NULL);
1700 if (oldstream)
1702 if (nvthd->ptx_dev->active_streams == oldstream)
1703 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1704 else
1706 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1707 while (s->next != oldstream)
1708 s = s->next;
1709 s->next = s->next->next;
1712 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1714 if (!map_fini (oldstream))
1715 GOMP_PLUGIN_fatal ("error when freeing host memory");
1717 free (oldstream);
1720 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1722 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1724 return 1;
1727 /* Plugin entry points. */
1729 const char *
1730 GOMP_OFFLOAD_get_name (void)
1732 return "nvptx";
1735 unsigned int
1736 GOMP_OFFLOAD_get_caps (void)
1738 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1741 int
1742 GOMP_OFFLOAD_get_type (void)
1744 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1747 int
1748 GOMP_OFFLOAD_get_num_devices (void)
1750 return nvptx_get_num_devices ();
1753 bool
1754 GOMP_OFFLOAD_init_device (int n)
1756 struct ptx_device *dev;
1758 pthread_mutex_lock (&ptx_dev_lock);
1760 if (!nvptx_init () || ptx_devices[n] != NULL)
1762 pthread_mutex_unlock (&ptx_dev_lock);
1763 return false;
1766 dev = nvptx_open_device (n);
1767 if (dev)
1769 ptx_devices[n] = dev;
1770 instantiated_devices++;
1773 pthread_mutex_unlock (&ptx_dev_lock);
1775 return dev != NULL;
1778 bool
1779 GOMP_OFFLOAD_fini_device (int n)
1781 pthread_mutex_lock (&ptx_dev_lock);
1783 if (ptx_devices[n] != NULL)
1785 if (!nvptx_attach_host_thread_to_device (n)
1786 || !nvptx_close_device (ptx_devices[n]))
1788 pthread_mutex_unlock (&ptx_dev_lock);
1789 return false;
1791 ptx_devices[n] = NULL;
1792 instantiated_devices--;
1795 pthread_mutex_unlock (&ptx_dev_lock);
1796 return true;
1799 /* Return the libgomp version number we're compatible with. There is
1800 no requirement for cross-version compatibility. */
1802 unsigned
1803 GOMP_OFFLOAD_version (void)
1805 return GOMP_VERSION;
1808 /* Initialize __nvptx_clocktick, if present in MODULE. */
1810 static void
1811 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1813 CUdeviceptr dptr;
1814 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1815 module, "__nvptx_clocktick");
1816 if (r == CUDA_ERROR_NOT_FOUND)
1817 return;
1818 if (r != CUDA_SUCCESS)
1819 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1820 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1821 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1822 sizeof (__nvptx_clocktick));
1823 if (r != CUDA_SUCCESS)
1824 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
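/* Worked example (illustrative): for a device clock of 1 GHz, clock_khz is
   1000000 and __nvptx_clocktick = 1e-3 / 1e6 = 1e-9, i.e. one nanosecond
   per clock tick.  */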
1827 /* Load the (partial) program described by TARGET_DATA to device
1828 number ORD. Allocate and return TARGET_TABLE. */
1830 int
1831 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1832 struct addr_pair **target_table)
1834 CUmodule module;
1835 const char *const *var_names;
1836 const struct targ_fn_launch *fn_descs;
1837 unsigned int fn_entries, var_entries, i, j;
1838 struct targ_fn_descriptor *targ_fns;
1839 struct addr_pair *targ_tbl;
1840 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1841 struct ptx_image_data *new_image;
1842 struct ptx_device *dev;
1844 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1846 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1847 " (expected %u, received %u)",
1848 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1849 return -1;
1852 if (!nvptx_attach_host_thread_to_device (ord)
1853 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1854 return -1;
1856 dev = ptx_devices[ord];
1858 /* The mkoffload utility emits a struct of pointers/integers at the
1859 start of each offload image. The array of kernel names and the
1860 function addresses form a one-to-one correspondence.  */
1862 var_entries = img_header->var_num;
1863 var_names = img_header->var_names;
1864 fn_entries = img_header->fn_num;
1865 fn_descs = img_header->fn_descs;
1867 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1868 * (fn_entries + var_entries));
1869 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1870 * fn_entries);
1872 *target_table = targ_tbl;
1874 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1875 new_image->target_data = target_data;
1876 new_image->module = module;
1877 new_image->fns = targ_fns;
1879 pthread_mutex_lock (&dev->image_lock);
1880 new_image->next = dev->images;
1881 dev->images = new_image;
1882 pthread_mutex_unlock (&dev->image_lock);
1884 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1886 CUfunction function;
1887 int nregs, mthrs;
1889 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1890 fn_descs[i].fn);
1891 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1892 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1893 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1894 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1896 targ_fns->fn = function;
1897 targ_fns->launch = &fn_descs[i];
1898 targ_fns->regs_per_thread = nregs;
1899 targ_fns->max_threads_per_block = mthrs;
1901 targ_tbl->start = (uintptr_t) targ_fns;
1902 targ_tbl->end = targ_tbl->start + 1;
1905 for (j = 0; j < var_entries; j++, targ_tbl++)
1907 CUdeviceptr var;
1908 size_t bytes;
1910 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1911 &var, &bytes, module, var_names[j]);
1913 targ_tbl->start = (uintptr_t) var;
1914 targ_tbl->end = targ_tbl->start + bytes;
1917 nvptx_set_clocktick (module, dev);
1919 return fn_entries + var_entries;
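/* Layout sketch of the returned table: entries 0 .. fn_entries-1 describe
   kernels (start points at the struct targ_fn_descriptor, end is start + 1),
   followed by var_entries entries whose start/end give the device address
   range of each global variable.  */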
1922 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1923 function descriptors allocated by G_O_load_image. */
1925 bool
1926 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1928 struct ptx_image_data *image, **prev_p;
1929 struct ptx_device *dev = ptx_devices[ord];
1931 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1933 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1934 " (expected %u, received %u)",
1935 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1936 return false;
1939 bool ret = true;
1940 pthread_mutex_lock (&dev->image_lock);
1941 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1942 if (image->target_data == target_data)
1944 *prev_p = image->next;
1945 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1946 ret = false;
1947 free (image->fns);
1948 free (image);
1949 break;
1951 pthread_mutex_unlock (&dev->image_lock);
1952 return ret;
1955 void *
1956 GOMP_OFFLOAD_alloc (int ord, size_t size)
1958 if (!nvptx_attach_host_thread_to_device (ord))
1959 return NULL;
1960 return nvptx_alloc (size);
1963 bool
1964 GOMP_OFFLOAD_free (int ord, void *ptr)
1966 return (nvptx_attach_host_thread_to_device (ord)
1967 && nvptx_free (ptr));
1970 bool
1971 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1973 return (nvptx_attach_host_thread_to_device (ord)
1974 && nvptx_dev2host (dst, src, n));
1977 bool
1978 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1980 return (nvptx_attach_host_thread_to_device (ord)
1981 && nvptx_host2dev (dst, src, n));
1984 bool
1985 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1987 struct ptx_device *ptx_dev = ptx_devices[ord];
1988 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1989 ptx_dev->null_stream->stream);
1990 return true;
1993 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1995 void
1996 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1997 void **hostaddrs, void **devaddrs,
1998 int async, unsigned *dims, void *targ_mem_desc)
2000 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
2003 void
2004 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
2006 struct nvptx_thread *nvthd = nvptx_thread ();
2007 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
2009 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
2010 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
2011 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
2014 int
2015 GOMP_OFFLOAD_openacc_async_test (int async)
2017 return nvptx_async_test (async);
2020 int
2021 GOMP_OFFLOAD_openacc_async_test_all (void)
2023 return nvptx_async_test_all ();
2026 void
2027 GOMP_OFFLOAD_openacc_async_wait (int async)
2029 nvptx_wait (async);
2032 void
2033 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
2035 nvptx_wait_async (async1, async2);
2038 void
2039 GOMP_OFFLOAD_openacc_async_wait_all (void)
2041 nvptx_wait_all ();
2044 void
2045 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
2047 nvptx_wait_all_async (async);
2050 void
2051 GOMP_OFFLOAD_openacc_async_set_async (int async)
2053 nvptx_set_async (async);
2056 void *
2057 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2059 struct ptx_device *ptx_dev;
2060 struct nvptx_thread *nvthd
2061 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2062 CUcontext thd_ctx;
2064 ptx_dev = ptx_devices[ord];
2066 assert (ptx_dev);
2068 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2070 assert (ptx_dev->ctx);
2072 if (!thd_ctx)
2073 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2075 nvthd->current_stream = ptx_dev->null_stream;
2076 nvthd->ptx_dev = ptx_dev;
2078 return (void *) nvthd;
2081 void
2082 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2084 free (data);
2087 void *
2088 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2090 return nvptx_get_current_cuda_device ();
2093 void *
2094 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2096 return nvptx_get_current_cuda_context ();
2099 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2101 void *
2102 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2104 return nvptx_get_cuda_stream (async);
2107 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2109 int
2110 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2112 return nvptx_set_cuda_stream (async, stream);
2115 /* Adjust launch dimensions: pick good values for number of blocks and warps
2116 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2117 own limits. */
2119 static void
2120 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2121 struct ptx_device *ptx_dev,
2122 int *teams_p, int *threads_p)
2124 int max_warps_block = fn->max_threads_per_block / 32;
2125 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2126 backend and libgcc, which matches the documented limit of all GPUs as of 2015. */
2127 if (max_warps_block > 32)
2128 max_warps_block = 32;
2129 if (*threads_p <= 0)
2130 *threads_p = 8;
2131 if (*threads_p > max_warps_block)
2132 *threads_p = max_warps_block;
2134 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2135 /* This is an estimate of how many blocks the device can host simultaneously.
2136 The actual limit, which may be lower, can be queried with the "occupancy
2137 control" driver interface (since CUDA 6.0).  */
2138 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2139 if (*teams_p <= 0 || *teams_p > max_blocks)
2140 *teams_p = max_blocks;
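/* Worked example (illustrative numbers only): with regs_per_thread = 32,
   *threads_p = 8 warps, regs_per_sm = 65536 and num_sms = 16:
   regs_per_block = 32 * 32 * 8 = 8192 and
   max_blocks = 65536 / 8192 * 16 = 128 teams.  */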
2143 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2144 target regions. */
2146 static size_t
2147 nvptx_stacks_size ()
2149 return 128 * 1024;
2152 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2154 static void *
2155 nvptx_stacks_alloc (size_t size, int num)
2157 CUdeviceptr stacks;
2158 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2159 if (r != CUDA_SUCCESS)
2160 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2161 return (void *) stacks;
2164 /* Release storage previously allocated by nvptx_stacks_alloc. */
2166 static void
2167 nvptx_stacks_free (void *p, int num)
2169 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2170 if (r != CUDA_SUCCESS)
2171 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2174 void
2175 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2177 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2178 CUresult r;
2179 struct ptx_device *ptx_dev = ptx_devices[ord];
2180 const char *maybe_abort_msg = "(perhaps abort was called)";
2181 int teams = 0, threads = 0;
2183 if (!args)
2184 GOMP_PLUGIN_fatal ("No target arguments provided");
2185 while (*args)
2187 intptr_t id = (intptr_t) *args++, val;
2188 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2189 val = (intptr_t) *args++;
2190 else
2191 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2192 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2193 continue;
2194 val = val > INT_MAX ? INT_MAX : val;
2195 id &= GOMP_TARGET_ARG_ID_MASK;
2196 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2197 teams = val;
2198 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2199 threads = val;
2201 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2203 size_t stack_size = nvptx_stacks_size ();
2204 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2205 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2206 size_t fn_args_size = sizeof fn_args;
2207 void *config[] = {
2208 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2209 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2210 CU_LAUNCH_PARAM_END
2212 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2213 32, threads, 1, 0, ptx_dev->null_stream->stream,
2214 NULL, config);
2215 if (r != CUDA_SUCCESS)
2216 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2218 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2219 if (r == CUDA_ERROR_LAUNCH_FAILED)
2220 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2221 maybe_abort_msg);
2222 else if (r != CUDA_SUCCESS)
2223 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2224 nvptx_stacks_free (stacks, teams * threads);
2227 void
2228 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2229 void *async_data)
2231 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");