1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
77 struct cuda_lib_s {
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
87 } cuda_lib;
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94    pointers; return false if unsuccessful, true if successful.  */
95 static bool
96 init_cuda_lib (void)
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited = true;
118 return true;
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
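/* Illustrative sketch of how a cuda-lib.def entry expands under the macros
   above.  The entry "CUDA_ONE_CALL (cuMemAlloc)" is assumed here purely for
   illustration.  With PLUGIN_NVPTX_DYNAMIC it contributes a member to
   struct cuda_lib_s plus a dlsym lookup in init_cuda_lib, and every call
   goes through CUDA_CALL_PREFIX (i.e. "cuda_lib.").  Without it,
   CUDA_ONE_CALL expands to nothing (CUDA_ONE_CALL_MAYBE_NULL emits
   "#pragma weak") and calls bind directly to libcuda at link time.  */
#if 0
/* Dynamic case, roughly: */
struct cuda_lib_s {
  __typeof (cuMemAlloc) *cuMemAlloc;
} cuda_lib;
/* ...and in init_cuda_lib: */
cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
if (cuda_lib.cuMemAlloc == NULL)
  return false;
#endif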
133 #include "secure_getenv.h"
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
154 } while (0)
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
168 } while (0)
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
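/* Illustrative usage sketch of the macros above; this is not part of the
   plugin and "example_alloc" is a hypothetical helper.  */
#if 0
static bool
example_alloc (CUdeviceptr *d, size_t size)
{
  /* Reports the error and returns false on failure; CUDA_CALL_ERET would
     return the given value (e.g. NULL) instead, and CUDA_CALL_ASSERT would
     call GOMP_PLUGIN_fatal rather than returning.  */
  CUDA_CALL (cuMemAlloc, d, size);
  return true;
}
#endif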
176 static const char *
177 cuda_error (CUresult r)
179 const char *fallback = "unknown cuda error";
180 const char *desc;
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
189 return fallback;
192 static unsigned int instantiated_devices = 0;
193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
195 struct ptx_stream
197 CUstream stream;
198 pthread_t host_thread;
199 bool multithreaded;
201 CUdeviceptr d;
202 void *h;
203 void *h_begin;
204 void *h_end;
205 void *h_next;
206 void *h_prev;
207 void *h_tail;
209 struct ptx_stream *next;
212 /* Thread-specific data for PTX. */
214 struct nvptx_thread
216 struct ptx_stream *current_stream;
217 struct ptx_device *ptx_dev;
220 static bool
221 map_init (struct ptx_stream *s)
223 int size = getpagesize ();
225 assert (s);
226 assert (!s->d);
227 assert (!s->h);
229 CUDA_CALL (cuMemAllocHost, &s->h, size);
230 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
232 assert (s->h);
234 s->h_begin = s->h;
235 s->h_end = s->h_begin + size;
236 s->h_next = s->h_prev = s->h_tail = s->h_begin;
238 assert (s->h_next);
239 assert (s->h_end);
240 return true;
243 static bool
244 map_fini (struct ptx_stream *s)
246 CUDA_CALL (cuMemFreeHost, s->h);
247 return true;
250 static void
251 map_pop (struct ptx_stream *s)
253 assert (s != NULL);
254 assert (s->h_next);
255 assert (s->h_prev);
256 assert (s->h_tail);
258 s->h_tail = s->h_next;
260 if (s->h_tail >= s->h_end)
261 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
263 if (s->h_next == s->h_tail)
264 s->h_prev = s->h_next;
266 assert (s->h_next >= s->h_begin);
267 assert (s->h_tail >= s->h_begin);
268 assert (s->h_prev >= s->h_begin);
270 assert (s->h_next <= s->h_end);
271 assert (s->h_tail <= s->h_end);
272 assert (s->h_prev <= s->h_end);
275 static void
276 map_push (struct ptx_stream *s, size_t size, void **h, void **d)
278 int left;
279 int offset;
281 assert (s != NULL);
283 left = s->h_end - s->h_next;
285 assert (s->h_prev);
286 assert (s->h_next);
288 if (size >= left)
290 assert (s->h_next == s->h_prev);
291 s->h_next = s->h_prev = s->h_tail = s->h_begin;
294 assert (s->h_next);
296 offset = s->h_next - s->h;
298 *d = (void *)(s->d + offset);
299 *h = (void *)(s->h + offset);
301 s->h_prev = s->h_next;
302 s->h_next += size;
304 assert (s->h_prev);
305 assert (s->h_next);
307 assert (s->h_next >= s->h_begin);
308 assert (s->h_tail >= s->h_begin);
309 assert (s->h_prev >= s->h_begin);
310 assert (s->h_next <= s->h_end);
311 assert (s->h_tail <= s->h_end);
312 assert (s->h_prev <= s->h_end);
314 return;
317 /* Target data function launch information. */
319 struct targ_fn_launch
321 const char *fn;
322 unsigned short dim[GOMP_DIM_MAX];
325 /* Target PTX object information. */
327 struct targ_ptx_obj
329 const char *code;
330 size_t size;
333 /* Target data image information. */
335 typedef struct nvptx_tdata
337 const struct targ_ptx_obj *ptx_objs;
338 unsigned ptx_num;
340 const char *const *var_names;
341 unsigned var_num;
343 const struct targ_fn_launch *fn_descs;
344 unsigned fn_num;
345 } nvptx_tdata_t;
347 /* Descriptor of a loaded function. */
349 struct targ_fn_descriptor
351 CUfunction fn;
352 const struct targ_fn_launch *launch;
353 int regs_per_thread;
354 int max_threads_per_block;
357 /* A loaded PTX image. */
358 struct ptx_image_data
360 const void *target_data;
361 CUmodule module;
363 struct targ_fn_descriptor *fns; /* Array of functions. */
365 struct ptx_image_data *next;
368 struct ptx_device
370 CUcontext ctx;
371 bool ctx_shared;
372 CUdevice dev;
373 struct ptx_stream *null_stream;
374 /* All non-null streams associated with this device (actually context),
375 either created implicitly or passed in from the user (via
376 acc_set_cuda_stream). */
377 struct ptx_stream *active_streams;
378 struct {
379 struct ptx_stream **arr;
380 int size;
381 } async_streams;
382 /* A lock for use when manipulating the above stream list and array. */
383 pthread_mutex_t stream_lock;
384 int ord;
385 bool overlap;
386 bool map;
387 bool concur;
388 bool mkern;
389 int mode;
390 int clock_khz;
391 int num_sms;
392 int regs_per_block;
393 int regs_per_sm;
394 int warp_size;
395 int max_threads_per_block;
396 int max_threads_per_multiprocessor;
397 int default_dims[GOMP_DIM_MAX];
399 struct ptx_image_data *images; /* Images loaded on device. */
400 pthread_mutex_t image_lock; /* Lock for above list. */
402 struct ptx_device *next;
405 enum ptx_event_type
407 PTX_EVT_MEM,
408 PTX_EVT_KNL,
409 PTX_EVT_SYNC,
410 PTX_EVT_ASYNC_CLEANUP
413 struct ptx_event
415 CUevent *evt;
416 int type;
417 void *addr;
418 int ord;
419 int val;
421 struct ptx_event *next;
424 static pthread_mutex_t ptx_event_lock;
425 static struct ptx_event *ptx_events;
427 static struct ptx_device **ptx_devices;
429 static inline struct nvptx_thread *
430 nvptx_thread (void)
432 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
435 static bool
436 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
438 int i;
439 struct ptx_stream *null_stream
440 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
442 null_stream->stream = NULL;
443 null_stream->host_thread = pthread_self ();
444 null_stream->multithreaded = true;
445 null_stream->d = (CUdeviceptr) NULL;
446 null_stream->h = NULL;
447 if (!map_init (null_stream))
448 return false;
450 ptx_dev->null_stream = null_stream;
451 ptx_dev->active_streams = NULL;
452 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
454 if (concurrency < 1)
455 concurrency = 1;
457 /* This is just a guess -- make space for as many async streams as the
458 current device is capable of concurrently executing. This can grow
459 later as necessary. No streams are created yet. */
460 ptx_dev->async_streams.arr
461 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
462 ptx_dev->async_streams.size = concurrency;
464 for (i = 0; i < concurrency; i++)
465 ptx_dev->async_streams.arr[i] = NULL;
467 return true;
470 static bool
471 fini_streams_for_device (struct ptx_device *ptx_dev)
473 free (ptx_dev->async_streams.arr);
475 bool ret = true;
476 while (ptx_dev->active_streams != NULL)
478 struct ptx_stream *s = ptx_dev->active_streams;
479 ptx_dev->active_streams = ptx_dev->active_streams->next;
481 ret &= map_fini (s);
483 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
484 if (r != CUDA_SUCCESS)
486 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
487 ret = false;
489 free (s);
492 ret &= map_fini (ptx_dev->null_stream);
493 free (ptx_dev->null_stream);
494 return ret;
497 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
498 thread THREAD (and also current device/context). If CREATE is true, create
499 the stream if it does not exist (or use EXISTING if it is non-NULL), and
500    associate the stream with the same thread argument.  Return the stream
501    to use.  */
503 static struct ptx_stream *
504 select_stream_for_async (int async, pthread_t thread, bool create,
505 CUstream existing)
507 struct nvptx_thread *nvthd = nvptx_thread ();
508 /* Local copy of TLS variable. */
509 struct ptx_device *ptx_dev = nvthd->ptx_dev;
510 struct ptx_stream *stream = NULL;
511 int orig_async = async;
513 /* The special value acc_async_noval (-1) maps (for now) to an
514 implicitly-created stream, which is then handled the same as any other
515 numbered async stream. Other options are available, e.g. using the null
516 stream for anonymous async operations, or choosing an idle stream from an
517 active set. But, stick with this for now. */
518 if (async > acc_async_sync)
519 async++;
521 if (create)
522 pthread_mutex_lock (&ptx_dev->stream_lock);
524 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
525 null stream, and in fact better performance may be obtainable if it doesn't
526 (because the null stream enforces overly-strict synchronisation with
527 respect to other streams for legacy reasons, and that's probably not
528 needed with OpenACC). Maybe investigate later. */
529 if (async == acc_async_sync)
530 stream = ptx_dev->null_stream;
531 else if (async >= 0 && async < ptx_dev->async_streams.size
532 && ptx_dev->async_streams.arr[async] && !(create && existing))
533 stream = ptx_dev->async_streams.arr[async];
534 else if (async >= 0 && create)
536 if (async >= ptx_dev->async_streams.size)
538 int i, newsize = ptx_dev->async_streams.size * 2;
540 if (async >= newsize)
541 newsize = async + 1;
543 ptx_dev->async_streams.arr
544 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
545 newsize * sizeof (struct ptx_stream *));
547 for (i = ptx_dev->async_streams.size; i < newsize; i++)
548 ptx_dev->async_streams.arr[i] = NULL;
550 ptx_dev->async_streams.size = newsize;
553 /* Create a new stream on-demand if there isn't one already, or if we're
554 setting a particular async value to an existing (externally-provided)
555 stream. */
556 if (!ptx_dev->async_streams.arr[async] || existing)
558 CUresult r;
559 struct ptx_stream *s
560 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
562 if (existing)
563 s->stream = existing;
564 else
566 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
567 CU_STREAM_DEFAULT);
568 if (r != CUDA_SUCCESS)
570 pthread_mutex_unlock (&ptx_dev->stream_lock);
571 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
572 cuda_error (r));
576 /* If CREATE is true, we're going to be queueing some work on this
577 stream. Associate it with the current host thread. */
578 s->host_thread = thread;
579 s->multithreaded = false;
581 s->d = (CUdeviceptr) NULL;
582 s->h = NULL;
583 if (!map_init (s))
585 pthread_mutex_unlock (&ptx_dev->stream_lock);
586 GOMP_PLUGIN_fatal ("map_init fail");
589 s->next = ptx_dev->active_streams;
590 ptx_dev->active_streams = s;
591 ptx_dev->async_streams.arr[async] = s;
594 stream = ptx_dev->async_streams.arr[async];
596 else if (async < 0)
598 if (create)
599 pthread_mutex_unlock (&ptx_dev->stream_lock);
600 GOMP_PLUGIN_fatal ("bad async %d", async);
603 if (create)
605 assert (stream != NULL);
607 /* If we're trying to use the same stream from different threads
608 simultaneously, set stream->multithreaded to true. This affects the
609 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
610 only wait for asynchronous launches from the same host thread they are
611 invoked on. If multiple threads use the same async value, we make note
612 of that here and fall back to testing/waiting for all threads in those
613 functions. */
614 if (thread != stream->host_thread)
615 stream->multithreaded = true;
617 pthread_mutex_unlock (&ptx_dev->stream_lock);
619 else if (stream && !stream->multithreaded
620 && !pthread_equal (stream->host_thread, thread))
621 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
623 return stream;
626 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
627 should be locked on entry and remains locked on exit. */
629 static bool
630 nvptx_init (void)
632 int ndevs;
634 if (instantiated_devices != 0)
635 return true;
637 ptx_events = NULL;
638 pthread_mutex_init (&ptx_event_lock, NULL);
640 if (!init_cuda_lib ())
641 return false;
643 CUDA_CALL (cuInit, 0);
645 CUDA_CALL (cuDeviceGetCount, &ndevs);
646 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
647 * ndevs);
648 return true;
651 /* Select the N'th PTX device for the current host thread. The device must
652 have been previously opened before calling this function. */
654 static bool
655 nvptx_attach_host_thread_to_device (int n)
657 CUdevice dev;
658 CUresult r;
659 struct ptx_device *ptx_dev;
660 CUcontext thd_ctx;
662 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
663 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
665 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
666 return false;
669 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
670 return true;
671 else
673 CUcontext old_ctx;
675 ptx_dev = ptx_devices[n];
676 if (!ptx_dev)
678 GOMP_PLUGIN_error ("device %d not found", n);
679 return false;
682 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
684 /* We don't necessarily have a current context (e.g. if it has been
685    destroyed).  Pop it if we do though.  */
686 if (thd_ctx != NULL)
687 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
689 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
691 return true;
694 static struct ptx_device *
695 nvptx_open_device (int n)
697 struct ptx_device *ptx_dev;
698 CUdevice dev, ctx_dev;
699 CUresult r;
700 int async_engines, pi;
702 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
704 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
706 ptx_dev->ord = n;
707 ptx_dev->dev = dev;
708 ptx_dev->ctx_shared = false;
710 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
711 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
713 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
714 return NULL;
717 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
719 /* The current host thread has an active context for a different device.
720 Detach it. */
721 CUcontext old_ctx;
722 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
725 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
727 if (!ptx_dev->ctx)
728 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
729 else
730 ptx_dev->ctx_shared = true;
732 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
733 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
734 ptx_dev->overlap = pi;
736 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
737 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
738 ptx_dev->map = pi;
740 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
741 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
742 ptx_dev->concur = pi;
744 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
745 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
746 ptx_dev->mode = pi;
748 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
749 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
750 ptx_dev->mkern = pi;
752 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
753 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
754 ptx_dev->clock_khz = pi;
756 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
757 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
758 ptx_dev->num_sms = pi;
760 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
761 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
762 ptx_dev->regs_per_block = pi;
764 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
765 in CUDA 6.0 and newer. */
766 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
767 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
768 dev);
769 /* Fallback: use limit of registers per block, which is usually equal. */
770 if (r == CUDA_ERROR_INVALID_VALUE)
771 pi = ptx_dev->regs_per_block;
772 else if (r != CUDA_SUCCESS)
774 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
775 return NULL;
777 ptx_dev->regs_per_sm = pi;
779 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
780 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
781 if (pi != 32)
783 GOMP_PLUGIN_error ("Only warp size 32 is supported");
784 return NULL;
786 ptx_dev->warp_size = pi;
788 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
789 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
790 ptx_dev->max_threads_per_block = pi;
792 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
793 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
794 ptx_dev->max_threads_per_multiprocessor = pi;
796 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
797 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
798 if (r != CUDA_SUCCESS)
799 async_engines = 1;
801 for (int i = 0; i != GOMP_DIM_MAX; i++)
802 ptx_dev->default_dims[i] = 0;
804 ptx_dev->images = NULL;
805 pthread_mutex_init (&ptx_dev->image_lock, NULL);
807 if (!init_streams_for_device (ptx_dev, async_engines))
808 return NULL;
810 return ptx_dev;
813 static bool
814 nvptx_close_device (struct ptx_device *ptx_dev)
816 if (!ptx_dev)
817 return true;
819 if (!fini_streams_for_device (ptx_dev))
820 return false;
822 pthread_mutex_destroy (&ptx_dev->image_lock);
824 if (!ptx_dev->ctx_shared)
825 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
827 free (ptx_dev);
828 return true;
831 static int
832 nvptx_get_num_devices (void)
834 int n;
836 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
837 configurations. */
838 if (sizeof (void *) != 8)
840 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
841 " only 64-bit configurations are supported\n");
842 return 0;
845 /* This function will be called before the plugin has been initialized in
846 order to enumerate available devices, but CUDA API routines can't be used
847 until cuInit has been called. Just call it now (but don't yet do any
848 further initialization). */
849 if (instantiated_devices == 0)
851 if (!init_cuda_lib ())
852 return 0;
853 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
854 /* This is not an error: e.g. we may have CUDA libraries installed but
855 no devices available. */
856 if (r != CUDA_SUCCESS)
858 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
859 cuda_error (r));
860 return 0;
864 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
865 return n;
868 static void
869 notify_var (const char *var_name, const char *env_var)
871 if (env_var == NULL)
872 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
873 else
874 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
877 static void
878 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
880 const char *var_name = "GOMP_NVPTX_JIT";
881 const char *env_var = secure_getenv (var_name);
882 notify_var (var_name, env_var);
884 if (env_var == NULL)
885 return;
887 const char *c = env_var;
888 while (*c != '\0')
890 while (*c == ' ')
891 c++;
893 if (c[0] == '-' && c[1] == 'O'
894 && '0' <= c[2] && c[2] <= '4'
895 && (c[3] == '\0' || c[3] == ' '))
897 *gomp_nvptx_o = c[2] - '0';
898 c += 3;
899 continue;
902 GOMP_PLUGIN_error ("Error parsing %s", var_name);
903 break;
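/* Usage example (illustrative): setting GOMP_NVPTX_JIT=-O2 in the
   environment makes link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL 2 to the
   CUDA JIT; the parser above accepts -O0 through -O4, separated by
   spaces.  */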
907 static bool
908 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
909 unsigned num_objs)
911 CUjit_option opts[7];
912 void *optvals[7];
913 float elapsed = 0.0;
914 char elog[1024];
915 char ilog[16384];
916 CUlinkState linkstate;
917 CUresult r;
918 void *linkout;
919 size_t linkoutsize __attribute__ ((unused));
921 opts[0] = CU_JIT_WALL_TIME;
922 optvals[0] = &elapsed;
924 opts[1] = CU_JIT_INFO_LOG_BUFFER;
925 optvals[1] = &ilog[0];
927 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
928 optvals[2] = (void *) sizeof ilog;
930 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
931 optvals[3] = &elog[0];
933 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
934 optvals[4] = (void *) sizeof elog;
936 opts[5] = CU_JIT_LOG_VERBOSE;
937 optvals[5] = (void *) 1;
939 static intptr_t gomp_nvptx_o = -1;
941 static bool init_done = false;
942 if (!init_done)
944 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
945 init_done = true;
948 int nopts = 6;
949 if (gomp_nvptx_o != -1)
951 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
952 optvals[nopts] = (void *) gomp_nvptx_o;
953 nopts++;
956 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
957 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
958 else
959 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
961 for (; num_objs--; ptx_objs++)
963 /* cuLinkAddData's 'data' argument erroneously omits the const
964 qualifier. */
965 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
966 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
967 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
968 (char *) ptx_objs->code, ptx_objs->size,
969 0, 0, 0, 0);
970 else
971 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
972 (char *) ptx_objs->code, ptx_objs->size,
973 0, 0, 0, 0);
974 if (r != CUDA_SUCCESS)
976 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
977 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
978 cuda_error (r));
979 return false;
983 GOMP_PLUGIN_debug (0, "Linking\n");
984 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
986 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
987 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
989 if (r != CUDA_SUCCESS)
991 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
992 return false;
995 CUDA_CALL (cuModuleLoadData, module, linkout);
996 CUDA_CALL (cuLinkDestroy, linkstate);
997 return true;
1000 static void
1001 event_gc (bool memmap_lockable)
1003 struct ptx_event *ptx_event = ptx_events;
1004 struct ptx_event *async_cleanups = NULL;
1005 struct nvptx_thread *nvthd = nvptx_thread ();
1007 pthread_mutex_lock (&ptx_event_lock);
1009 while (ptx_event != NULL)
1011 CUresult r;
1012 struct ptx_event *e = ptx_event;
1014 ptx_event = ptx_event->next;
1016 if (e->ord != nvthd->ptx_dev->ord)
1017 continue;
1019 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1020 if (r == CUDA_SUCCESS)
1022 bool append_async = false;
1023 CUevent *te;
1025 te = e->evt;
1027 switch (e->type)
1029 case PTX_EVT_MEM:
1030 case PTX_EVT_SYNC:
1031 break;
1033 case PTX_EVT_KNL:
1034 map_pop (e->addr);
1035 break;
1037 case PTX_EVT_ASYNC_CLEANUP:
1039 /* The function gomp_plugin_async_unmap_vars needs to claim the
1040 memory-map splay tree lock for the current device, so we
1041 can't call it when one of our callers has already claimed
1042 the lock. In that case, just delay the GC for this event
1043 until later. */
1044 if (!memmap_lockable)
1045 continue;
1047 append_async = true;
1049 break;
1052 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1053 free ((void *)te);
1055 /* Unlink 'e' from ptx_events list. */
1056 if (ptx_events == e)
1057 ptx_events = ptx_events->next;
1058 else
1060 struct ptx_event *e_ = ptx_events;
1061 while (e_->next != e)
1062 e_ = e_->next;
1063 e_->next = e_->next->next;
1066 if (append_async)
1068 e->next = async_cleanups;
1069 async_cleanups = e;
1071 else
1072 free (e);
1076 pthread_mutex_unlock (&ptx_event_lock);
1078 /* We have to do these here, after ptx_event_lock is released. */
1079 while (async_cleanups)
1081 struct ptx_event *e = async_cleanups;
1082 async_cleanups = async_cleanups->next;
1084 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1085 free (e);
1089 static void
1090 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1092 struct ptx_event *ptx_event;
1093 struct nvptx_thread *nvthd = nvptx_thread ();
1095 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1096 || type == PTX_EVT_ASYNC_CLEANUP);
1098 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1099 ptx_event->type = type;
1100 ptx_event->evt = e;
1101 ptx_event->addr = h;
1102 ptx_event->ord = nvthd->ptx_dev->ord;
1103 ptx_event->val = val;
1105 pthread_mutex_lock (&ptx_event_lock);
1107 ptx_event->next = ptx_events;
1108 ptx_events = ptx_event;
1110 pthread_mutex_unlock (&ptx_event_lock);
1113 static void
1114 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1115 int async, unsigned *dims, void *targ_mem_desc)
1117 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1118 CUfunction function;
1119 CUresult r;
1120 int i;
1121 struct ptx_stream *dev_str;
1122 void *kargs[1];
1123 void *hp, *dp;
1124 struct nvptx_thread *nvthd = nvptx_thread ();
1125 int warp_size = nvthd->ptx_dev->warp_size;
1126 const char *maybe_abort_msg = "(perhaps abort was called)";
1128 function = targ_fn->fn;
1130 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1131 assert (dev_str == nvthd->current_stream);
1133 /* Initialize the launch dimensions. Typically this is constant,
1134 provided by the device compiler, but we must permit runtime
1135 values. */
1136 int seen_zero = 0;
1137 for (i = 0; i != GOMP_DIM_MAX; i++)
1139 if (targ_fn->launch->dim[i])
1140 dims[i] = targ_fn->launch->dim[i];
1141 if (!dims[i])
1142 seen_zero = 1;
1145 if (seen_zero)
1147 pthread_mutex_lock (&ptx_dev_lock);
1149 static int gomp_openacc_dims[GOMP_DIM_MAX];
1150 if (!gomp_openacc_dims[0])
1152 /* See if the user provided GOMP_OPENACC_DIM environment
1153 variable to specify runtime defaults. */
1154 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1155 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
1158 if (!nvthd->ptx_dev->default_dims[0])
1160 int default_dims[GOMP_DIM_MAX];
1161 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1162 default_dims[i] = gomp_openacc_dims[i];
1164 int gang, worker, vector;
1166 int block_size = nvthd->ptx_dev->max_threads_per_block;
1167 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
1168 int dev_size = nvthd->ptx_dev->num_sms;
1169 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1170 " dev_size=%d, cpu_size=%d\n",
1171 warp_size, block_size, dev_size, cpu_size);
1173 gang = (cpu_size / block_size) * dev_size;
1174 worker = block_size / warp_size;
1175 vector = warp_size;
1178 /* There is no upper bound on the gang size. The best size
1179 matches the hardware configuration. Logical gangs are
1180 scheduled onto physical hardware. To maximize usage, we
1181 should guess a large number. */
1182 if (default_dims[GOMP_DIM_GANG] < 1)
1183 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1184 /* The worker size must not exceed the hardware. */
1185 if (default_dims[GOMP_DIM_WORKER] < 1
1186 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1187 default_dims[GOMP_DIM_WORKER] = worker;
1188 /* The vector size must exactly match the hardware. */
1189 if (default_dims[GOMP_DIM_VECTOR] < 1
1190 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1191 default_dims[GOMP_DIM_VECTOR] = vector;
1193 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1194 default_dims[GOMP_DIM_GANG],
1195 default_dims[GOMP_DIM_WORKER],
1196 default_dims[GOMP_DIM_VECTOR]);
1198 for (i = 0; i != GOMP_DIM_MAX; i++)
1199 nvthd->ptx_dev->default_dims[i] = default_dims[i];
1201 pthread_mutex_unlock (&ptx_dev_lock);
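/* Worked example (illustrative): on a hypothetical device with
   max_threads_per_block = 1024, max_threads_per_multiprocessor = 2048,
   num_sms = 20 and warp_size = 32, the computation above yields
   gang = (2048 / 1024) * 20 = 40, worker = 1024 / 32 = 32 and vector = 32,
   so the cached default launch dimensions become [40, 32, 32] unless
   GOMP_OPENACC_DIM overrides them.  */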
1204 bool default_dim_p[GOMP_DIM_MAX];
1205 for (i = 0; i != GOMP_DIM_MAX; i++)
1206 default_dim_p[i] = !dims[i];
1208 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
1210 for (i = 0; i != GOMP_DIM_MAX; i++)
1211 if (default_dim_p[i])
1212 dims[i] = nvthd->ptx_dev->default_dims[i];
1214 if (default_dim_p[GOMP_DIM_VECTOR])
1215 dims[GOMP_DIM_VECTOR]
1216 = MIN (dims[GOMP_DIM_VECTOR],
1217 (targ_fn->max_threads_per_block / warp_size
1218 * warp_size));
1220 if (default_dim_p[GOMP_DIM_WORKER])
1221 dims[GOMP_DIM_WORKER]
1222 = MIN (dims[GOMP_DIM_WORKER],
1223 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
1225 else
1227 /* Handle the case that the compiler allows the runtime to choose
1228 the vector-length conservatively, by ignoring
1229 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
1230 it. */
1231 int vectors = 0;
1232        /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
1233 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
1234 exceed targ_fn->max_threads_per_block. */
1235 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
1236 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
1237 int grids, blocks;
1239 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
1240 &blocks, function, NULL, 0,
1241 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
1242 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
1243 "grid = %d, block = %d\n", grids, blocks);
1245 /* Keep the num_gangs proportional to the block size. In
1246         the case where a block size is limited by shared-memory
1247         or the register file capacity, the runtime will not
1248         excessively over-assign gangs to the multiprocessor
1249 units if their state is going to be swapped out even
1250 more than necessary. The constant factor 2 is there to
1251 prevent threads from idling when there is insufficient
1252 work for them. */
1253 if (gangs == 0)
1254 gangs = 2 * grids * (blocks / warp_size);
1256 if (vectors == 0)
1257 vectors = warp_size;
1259 if (workers == 0)
1261 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
1262 ? vectors
1263 : dims[GOMP_DIM_VECTOR]);
1264 workers = blocks / actual_vectors;
1267 for (i = 0; i != GOMP_DIM_MAX; i++)
1268 if (default_dim_p[i])
1269 switch (i)
1271 case GOMP_DIM_GANG: dims[i] = gangs; break;
1272 case GOMP_DIM_WORKER: dims[i] = workers; break;
1273 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
1274 default: GOMP_PLUGIN_fatal ("invalid dim");
1280 /* Check if the accelerator has sufficient hardware resources to
1281 launch the offloaded kernel. */
1282 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
1283 > targ_fn->max_threads_per_block)
1285 int suggest_workers
1286 = targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
1287 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1288 " launch '%s' with num_workers = %d; recompile the"
1289 " program with 'num_workers = %d' on that offloaded"
1290 " region or '-fopenacc-dim=:%d'",
1291 targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1292 suggest_workers, suggest_workers);
1295 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1296 the host and the device. HP is a host pointer to the new chunk, and DP is
1297 the corresponding device pointer. */
1298 map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
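/* For example (illustrative), with mapnum == 3 on a 64-bit host this carves
   24 bytes out of the stream's page-locked page; HP and DP then refer to the
   same memory through its host and device addresses respectively.  */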
1300 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1302 /* Copy the array of arguments to the mapped page. */
1303 for (i = 0; i < mapnum; i++)
1304 ((void **) hp)[i] = devaddrs[i];
1306 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1307 fact have the same value on a unified-memory system). */
1308 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1309 mapnum * sizeof (void *));
1310 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1311 " gangs=%u, workers=%u, vectors=%u\n",
1312 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1313 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1315 // OpenACC CUDA
1317 // num_gangs nctaid.x
1318 // num_workers ntid.y
1319 // vector length ntid.x
1321 kargs[0] = &dp;
1322 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1323 dims[GOMP_DIM_GANG], 1, 1,
1324 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1325 0, dev_str->stream, kargs, 0);
1327 #ifndef DISABLE_ASYNC
1328 if (async < acc_async_noval)
1330 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1331 if (r == CUDA_ERROR_LAUNCH_FAILED)
1332 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1333 maybe_abort_msg);
1334 else if (r != CUDA_SUCCESS)
1335 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1337 else
1339 CUevent *e;
1341 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1343 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1344 if (r == CUDA_ERROR_LAUNCH_FAILED)
1345 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1346 maybe_abort_msg);
1347 else if (r != CUDA_SUCCESS)
1348 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1350 event_gc (true);
1352 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1354 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1356 #else
1357 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1358 if (r == CUDA_ERROR_LAUNCH_FAILED)
1359 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1360 maybe_abort_msg);
1361 else if (r != CUDA_SUCCESS)
1362 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1363 #endif
1365 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1366 targ_fn->launch->fn);
1368 #ifndef DISABLE_ASYNC
1369 if (async < acc_async_noval)
1370 #endif
1371 map_pop (dev_str);
1374 void * openacc_get_current_cuda_context (void);
1376 static void *
1377 nvptx_alloc (size_t s)
1379 CUdeviceptr d;
1381 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1382 return (void *) d;
1385 static bool
1386 nvptx_free (void *p)
1388 CUdeviceptr pb;
1389 size_t ps;
1391 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1392 if ((CUdeviceptr) p != pb)
1394 GOMP_PLUGIN_error ("invalid device address");
1395 return false;
1398 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1399 return true;
1403 static bool
1404 nvptx_host2dev (void *d, const void *h, size_t s)
1406 CUdeviceptr pb;
1407 size_t ps;
1408 struct nvptx_thread *nvthd = nvptx_thread ();
1410 if (!s)
1411 return true;
1412 if (!d)
1414 GOMP_PLUGIN_error ("invalid device address");
1415 return false;
1418 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1420 if (!pb)
1422 GOMP_PLUGIN_error ("invalid device address");
1423 return false;
1425 if (!h)
1427 GOMP_PLUGIN_error ("invalid host address");
1428 return false;
1430 if (d == h)
1432 GOMP_PLUGIN_error ("invalid host or device address");
1433 return false;
1435 if ((void *)(d + s) > (void *)(pb + ps))
1437 GOMP_PLUGIN_error ("invalid size");
1438 return false;
1441 #ifndef DISABLE_ASYNC
1442 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1444 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1445 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1446 event_gc (false);
1447 CUDA_CALL (cuMemcpyHtoDAsync,
1448 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1449 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1450 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1452 else
1453 #endif
1454 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1456 return true;
1459 static bool
1460 nvptx_dev2host (void *h, const void *d, size_t s)
1462 CUdeviceptr pb;
1463 size_t ps;
1464 struct nvptx_thread *nvthd = nvptx_thread ();
1466 if (!s)
1467 return true;
1468 if (!d)
1470 GOMP_PLUGIN_error ("invalid device address");
1471 return false;
1474 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1476 if (!pb)
1478 GOMP_PLUGIN_error ("invalid device address");
1479 return false;
1481 if (!h)
1483 GOMP_PLUGIN_error ("invalid host address");
1484 return false;
1486 if (d == h)
1488 GOMP_PLUGIN_error ("invalid host or device address");
1489 return false;
1491 if ((void *)(d + s) > (void *)(pb + ps))
1493 GOMP_PLUGIN_error ("invalid size");
1494 return false;
1497 #ifndef DISABLE_ASYNC
1498 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1500 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1501 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1502 event_gc (false);
1503 CUDA_CALL (cuMemcpyDtoHAsync,
1504 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1505 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1506 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1508 else
1509 #endif
1510 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1512 return true;
1515 static void
1516 nvptx_set_async (int async)
1518 struct nvptx_thread *nvthd = nvptx_thread ();
1519 nvthd->current_stream
1520 = select_stream_for_async (async, pthread_self (), true, NULL);
1523 static int
1524 nvptx_async_test (int async)
1526 CUresult r;
1527 struct ptx_stream *s;
1529 s = select_stream_for_async (async, pthread_self (), false, NULL);
1531 if (!s)
1532 GOMP_PLUGIN_fatal ("unknown async %d", async);
1534 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1535 if (r == CUDA_SUCCESS)
1537 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1538 whether all work has completed on this stream, and if so omits the call
1539 to the wait hook. If that happens, event_gc might not get called
1540 (which prevents variables from getting unmapped and their associated
1541 device storage freed), so call it here. */
1542 event_gc (true);
1543 return 1;
1545 else if (r == CUDA_ERROR_NOT_READY)
1546 return 0;
1548 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1550 return 0;
1553 static int
1554 nvptx_async_test_all (void)
1556 struct ptx_stream *s;
1557 pthread_t self = pthread_self ();
1558 struct nvptx_thread *nvthd = nvptx_thread ();
1560 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1562 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1564 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1565 && CUDA_CALL_NOCHECK (cuStreamQuery,
1566 s->stream) == CUDA_ERROR_NOT_READY)
1568 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1569 return 0;
1573 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1575 event_gc (true);
1577 return 1;
1580 static void
1581 nvptx_wait (int async)
1583 struct ptx_stream *s;
1585 s = select_stream_for_async (async, pthread_self (), false, NULL);
1586 if (!s)
1587 GOMP_PLUGIN_fatal ("unknown async %d", async);
1589 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1591 event_gc (true);
1594 static void
1595 nvptx_wait_async (int async1, int async2)
1597 CUevent *e;
1598 struct ptx_stream *s1, *s2;
1599 pthread_t self = pthread_self ();
1601 /* The stream that is waiting (rather than being waited for) doesn't
1602 necessarily have to exist already. */
1603 s2 = select_stream_for_async (async2, self, true, NULL);
1605 s1 = select_stream_for_async (async1, self, false, NULL);
1606 if (!s1)
1607 GOMP_PLUGIN_fatal ("invalid async 1\n");
1609 if (s1 == s2)
1610 GOMP_PLUGIN_fatal ("identical parameters");
1612 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1614 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1616 event_gc (true);
1618 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1620 event_add (PTX_EVT_SYNC, e, NULL, 0);
1622 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1625 static void
1626 nvptx_wait_all (void)
1628 CUresult r;
1629 struct ptx_stream *s;
1630 pthread_t self = pthread_self ();
1631 struct nvptx_thread *nvthd = nvptx_thread ();
1633 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1635 /* Wait for active streams initiated by this thread (or by multiple threads)
1636 to complete. */
1637 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1639 if (s->multithreaded || pthread_equal (s->host_thread, self))
1641 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1642 if (r == CUDA_SUCCESS)
1643 continue;
1644 else if (r != CUDA_ERROR_NOT_READY)
1645 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1647 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1651 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1653 event_gc (true);
1656 static void
1657 nvptx_wait_all_async (int async)
1659 struct ptx_stream *waiting_stream, *other_stream;
1660 CUevent *e;
1661 struct nvptx_thread *nvthd = nvptx_thread ();
1662 pthread_t self = pthread_self ();
1664 /* The stream doing the waiting. This could be the first mention of the
1665 stream, so create it if necessary. */
1666 waiting_stream
1667 = select_stream_for_async (async, pthread_self (), true, NULL);
1669 /* Launches on the null stream already block on other streams in the
1670 context. */
1671 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1672 return;
1674 event_gc (true);
1676 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1678 for (other_stream = nvthd->ptx_dev->active_streams;
1679 other_stream != NULL;
1680 other_stream = other_stream->next)
1682 if (!other_stream->multithreaded
1683 && !pthread_equal (other_stream->host_thread, self))
1684 continue;
1686 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1688 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1690 /* Record an event on the waited-for stream. */
1691 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1693 event_add (PTX_EVT_SYNC, e, NULL, 0);
1695 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1698 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1701 static void *
1702 nvptx_get_current_cuda_device (void)
1704 struct nvptx_thread *nvthd = nvptx_thread ();
1706 if (!nvthd || !nvthd->ptx_dev)
1707 return NULL;
1709 return &nvthd->ptx_dev->dev;
1712 static void *
1713 nvptx_get_current_cuda_context (void)
1715 struct nvptx_thread *nvthd = nvptx_thread ();
1717 if (!nvthd || !nvthd->ptx_dev)
1718 return NULL;
1720 return nvthd->ptx_dev->ctx;
1723 static void *
1724 nvptx_get_cuda_stream (int async)
1726 struct ptx_stream *s;
1727 struct nvptx_thread *nvthd = nvptx_thread ();
1729 if (!nvthd || !nvthd->ptx_dev)
1730 return NULL;
1732 s = select_stream_for_async (async, pthread_self (), false, NULL);
1734 return s ? s->stream : NULL;
1737 static int
1738 nvptx_set_cuda_stream (int async, void *stream)
1740 struct ptx_stream *oldstream;
1741 pthread_t self = pthread_self ();
1742 struct nvptx_thread *nvthd = nvptx_thread ();
1744 if (async < 0)
1745 GOMP_PLUGIN_fatal ("bad async %d", async);
1747 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1749 /* We have a list of active streams and an array mapping async values to
1750 entries of that list. We need to take "ownership" of the passed-in stream,
1751 and add it to our list, removing the previous entry also (if there was one)
1752 in order to prevent resource leaks. Note the potential for surprise
1753 here: maybe we should keep track of passed-in streams and leave it up to
1754 the user to tidy those up, but that doesn't work for stream handles
1755 returned from acc_get_cuda_stream above... */
1757 oldstream = select_stream_for_async (async, self, false, NULL);
1759 if (oldstream)
1761 if (nvthd->ptx_dev->active_streams == oldstream)
1762 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1763 else
1765 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1766 while (s->next != oldstream)
1767 s = s->next;
1768 s->next = s->next->next;
1771 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1773 if (!map_fini (oldstream))
1774 GOMP_PLUGIN_fatal ("error when freeing host memory");
1776 free (oldstream);
1779 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1781 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1783 return 1;
1786 /* Plugin entry points. */
1788 const char *
1789 GOMP_OFFLOAD_get_name (void)
1791 return "nvptx";
1794 unsigned int
1795 GOMP_OFFLOAD_get_caps (void)
1797 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1801 GOMP_OFFLOAD_get_type (void)
1803 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1807 GOMP_OFFLOAD_get_num_devices (void)
1809 return nvptx_get_num_devices ();
1812 bool
1813 GOMP_OFFLOAD_init_device (int n)
1815 struct ptx_device *dev;
1817 pthread_mutex_lock (&ptx_dev_lock);
1819 if (!nvptx_init () || ptx_devices[n] != NULL)
1821 pthread_mutex_unlock (&ptx_dev_lock);
1822 return false;
1825 dev = nvptx_open_device (n);
1826 if (dev)
1828 ptx_devices[n] = dev;
1829 instantiated_devices++;
1832 pthread_mutex_unlock (&ptx_dev_lock);
1834 return dev != NULL;
1837 bool
1838 GOMP_OFFLOAD_fini_device (int n)
1840 pthread_mutex_lock (&ptx_dev_lock);
1842 if (ptx_devices[n] != NULL)
1844 if (!nvptx_attach_host_thread_to_device (n)
1845 || !nvptx_close_device (ptx_devices[n]))
1847 pthread_mutex_unlock (&ptx_dev_lock);
1848 return false;
1850 ptx_devices[n] = NULL;
1851 instantiated_devices--;
1854 pthread_mutex_unlock (&ptx_dev_lock);
1855 return true;
1858 /* Return the libgomp version number we're compatible with. There is
1859 no requirement for cross-version compatibility. */
1861 unsigned
1862 GOMP_OFFLOAD_version (void)
1864 return GOMP_VERSION;
1867 /* Initialize __nvptx_clocktick, if present in MODULE. */
1869 static void
1870 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1872 CUdeviceptr dptr;
1873 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1874 module, "__nvptx_clocktick");
1875 if (r == CUDA_ERROR_NOT_FOUND)
1876 return;
1877 if (r != CUDA_SUCCESS)
1878 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1879 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1880 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1881 sizeof (__nvptx_clocktick));
1882 if (r != CUDA_SUCCESS)
1883 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1886 /* Load the (partial) program described by TARGET_DATA to device
1887 number ORD. Allocate and return TARGET_TABLE. */
1890 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1891 struct addr_pair **target_table)
1893 CUmodule module;
1894 const char *const *var_names;
1895 const struct targ_fn_launch *fn_descs;
1896 unsigned int fn_entries, var_entries, i, j;
1897 struct targ_fn_descriptor *targ_fns;
1898 struct addr_pair *targ_tbl;
1899 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1900 struct ptx_image_data *new_image;
1901 struct ptx_device *dev;
1903 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1905 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1906 " (expected %u, received %u)",
1907 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1908 return -1;
1911 if (!nvptx_attach_host_thread_to_device (ord)
1912 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1913 return -1;
1915 dev = ptx_devices[ord];
1917 /* The mkoffload utility emits a struct of pointers/integers at the
1918 start of each offload image. The array of kernel names and the
1919    function addresses form a one-to-one correspondence.  */
1921 var_entries = img_header->var_num;
1922 var_names = img_header->var_names;
1923 fn_entries = img_header->fn_num;
1924 fn_descs = img_header->fn_descs;
1926 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1927 * (fn_entries + var_entries));
1928 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1929 * fn_entries);
1931 *target_table = targ_tbl;
1933 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1934 new_image->target_data = target_data;
1935 new_image->module = module;
1936 new_image->fns = targ_fns;
1938 pthread_mutex_lock (&dev->image_lock);
1939 new_image->next = dev->images;
1940 dev->images = new_image;
1941 pthread_mutex_unlock (&dev->image_lock);
1943 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1945 CUfunction function;
1946 int nregs, mthrs;
1948 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1949 fn_descs[i].fn);
1950 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1951 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1952 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1953 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1955 targ_fns->fn = function;
1956 targ_fns->launch = &fn_descs[i];
1957 targ_fns->regs_per_thread = nregs;
1958 targ_fns->max_threads_per_block = mthrs;
1960 targ_tbl->start = (uintptr_t) targ_fns;
1961 targ_tbl->end = targ_tbl->start + 1;
1964 for (j = 0; j < var_entries; j++, targ_tbl++)
1966 CUdeviceptr var;
1967 size_t bytes;
1969 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1970 &var, &bytes, module, var_names[j]);
1972 targ_tbl->start = (uintptr_t) var;
1973 targ_tbl->end = targ_tbl->start + bytes;
1976 nvptx_set_clocktick (module, dev);
1978 return fn_entries + var_entries;
1981 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1982 function descriptors allocated by G_O_load_image. */
1984 bool
1985 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1987 struct ptx_image_data *image, **prev_p;
1988 struct ptx_device *dev = ptx_devices[ord];
1990 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1992 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1993 " (expected %u, received %u)",
1994 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1995 return false;
1998 bool ret = true;
1999 pthread_mutex_lock (&dev->image_lock);
2000 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
2001 if (image->target_data == target_data)
2003 *prev_p = image->next;
2004 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
2005 ret = false;
2006 free (image->fns);
2007 free (image);
2008 break;
2010 pthread_mutex_unlock (&dev->image_lock);
2011 return ret;
2014 void *
2015 GOMP_OFFLOAD_alloc (int ord, size_t size)
2017 if (!nvptx_attach_host_thread_to_device (ord))
2018 return NULL;
2019 return nvptx_alloc (size);
2022 bool
2023 GOMP_OFFLOAD_free (int ord, void *ptr)
2025 return (nvptx_attach_host_thread_to_device (ord)
2026 && nvptx_free (ptr));
2029 bool
2030 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
2032 return (nvptx_attach_host_thread_to_device (ord)
2033 && nvptx_dev2host (dst, src, n));
2036 bool
2037 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
2039 return (nvptx_attach_host_thread_to_device (ord)
2040 && nvptx_host2dev (dst, src, n));
2043 bool
2044 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2046 struct ptx_device *ptx_dev = ptx_devices[ord];
2047 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
2048 ptx_dev->null_stream->stream);
2049 return true;
2052 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
2054 void
2055 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
2056 void **hostaddrs, void **devaddrs,
2057 int async, unsigned *dims, void *targ_mem_desc)
2059 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
2062 void
2063 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
2065 struct nvptx_thread *nvthd = nvptx_thread ();
2066 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
2068 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
2069 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
2070 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
2074 GOMP_OFFLOAD_openacc_async_test (int async)
2076 return nvptx_async_test (async);
2080 GOMP_OFFLOAD_openacc_async_test_all (void)
2082 return nvptx_async_test_all ();
2085 void
2086 GOMP_OFFLOAD_openacc_async_wait (int async)
2088 nvptx_wait (async);
2091 void
2092 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
2094 nvptx_wait_async (async1, async2);
2097 void
2098 GOMP_OFFLOAD_openacc_async_wait_all (void)
2100 nvptx_wait_all ();
2103 void
2104 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
2106 nvptx_wait_all_async (async);
2109 void
2110 GOMP_OFFLOAD_openacc_async_set_async (int async)
2112 nvptx_set_async (async);
2115 void *
2116 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2118 struct ptx_device *ptx_dev;
2119 struct nvptx_thread *nvthd
2120 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2121 CUcontext thd_ctx;
2123 ptx_dev = ptx_devices[ord];
2125 assert (ptx_dev);
2127 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2129 assert (ptx_dev->ctx);
2131 if (!thd_ctx)
2132 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2134 nvthd->current_stream = ptx_dev->null_stream;
2135 nvthd->ptx_dev = ptx_dev;
2137 return (void *) nvthd;
2140 void
2141 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2143 free (data);
2146 void *
2147 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2149 return nvptx_get_current_cuda_device ();
2152 void *
2153 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2155 return nvptx_get_current_cuda_context ();
2158 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2160 void *
2161 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2163 return nvptx_get_cuda_stream (async);
2166 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2169 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2171 return nvptx_set_cuda_stream (async, stream);
2174 /* Adjust launch dimensions: pick good values for number of blocks and warps
2175    and ensure that the number of warps does not exceed the CUDA limits or GCC's
2176    own limits.  */
2178 static void
2179 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2180 struct ptx_device *ptx_dev,
2181 int *teams_p, int *threads_p)
2183 int max_warps_block = fn->max_threads_per_block / 32;
2184 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2185      and libgcc, which matches the documented limit of all GPUs as of 2015.  */
2186 if (max_warps_block > 32)
2187 max_warps_block = 32;
2188 if (*threads_p <= 0)
2189 *threads_p = 8;
2190 if (*threads_p > max_warps_block)
2191 *threads_p = max_warps_block;
2193 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2194 /* This is an estimate of how many blocks the device can host simultaneously.
2195      The actual limit, which may be lower, can be queried with the "occupancy
2196      control" driver interface (since CUDA 6.0).  */
2197 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2198 if (*teams_p <= 0 || *teams_p > max_blocks)
2199 *teams_p = max_blocks;
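/* Worked example (illustrative): for a hypothetical kernel with
   regs_per_thread = 32 and max_threads_per_block = 1024 on a device with
   regs_per_sm = 65536 and num_sms = 20, max_warps_block = 1024 / 32 = 32,
   so *threads_p is clamped to 32; regs_per_block = 32 * 32 * 32 = 32768 and
   max_blocks = (65536 / 32768) * 20 = 40, so a non-positive or larger
   *teams_p becomes 40.  */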
2202 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2203 target regions. */
2205 static size_t
2206 nvptx_stacks_size ()
2208 return 128 * 1024;
2211 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2213 static void *
2214 nvptx_stacks_alloc (size_t size, int num)
2216 CUdeviceptr stacks;
2217 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2218 if (r != CUDA_SUCCESS)
2219 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2220 return (void *) stacks;
2223 /* Release storage previously allocated by nvptx_stacks_alloc. */
2225 static void
2226 nvptx_stacks_free (void *p, int num)
2228 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2229 if (r != CUDA_SUCCESS)
2230 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2233 void
2234 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2236 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2237 CUresult r;
2238 struct ptx_device *ptx_dev = ptx_devices[ord];
2239 const char *maybe_abort_msg = "(perhaps abort was called)";
2240 int teams = 0, threads = 0;
2242 if (!args)
2243 GOMP_PLUGIN_fatal ("No target arguments provided");
2244 while (*args)
2246 intptr_t id = (intptr_t) *args++, val;
2247 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2248 val = (intptr_t) *args++;
2249 else
2250 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2251 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2252 continue;
2253 val = val > INT_MAX ? INT_MAX : val;
2254 id &= GOMP_TARGET_ARG_ID_MASK;
2255 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2256 teams = val;
2257 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2258 threads = val;
2260 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2262 size_t stack_size = nvptx_stacks_size ();
2263 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
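/* For example (illustrative): with teams = 40 and threads = 32 this reserves
   40 * 32 * 128 KiB = 160 MiB of device memory, one soft stack per warp.  */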
2264 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2265 size_t fn_args_size = sizeof fn_args;
2266 void *config[] = {
2267 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2268 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2269 CU_LAUNCH_PARAM_END
2271 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2272 32, threads, 1, 0, ptx_dev->null_stream->stream,
2273 NULL, config);
2274 if (r != CUDA_SUCCESS)
2275 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2277 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2278 if (r == CUDA_ERROR_LAUNCH_FAILED)
2279 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2280 maybe_abort_msg);
2281 else if (r != CUDA_SUCCESS)
2282 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2283 nvptx_stacks_free (stacks, teams * threads);
2286 void
2287 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2288 void *async_data)
2290 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");