1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
77 struct cuda_lib_s {
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
87 } cuda_lib;
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers; return false if unsuccessful, true if successful. */
95 static bool
96 init_cuda_lib (void)
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited = true;
118 return true;
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
133 #include "secure_getenv.h"
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence, as well as for CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
154 } while (0)
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
168 } while (0)
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
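/* Illustrative use of the macros above (a sketch, not code compiled here):
   inside a function returning bool,

     CUDA_CALL (cuMemAlloc, &d, size);

   reports a failure with GOMP_PLUGIN_error and returns false, while
   CUDA_CALL_ERET (NULL, ...) returns NULL instead, CUDA_CALL_ASSERT
   aborts via GOMP_PLUGIN_fatal, and CUDA_CALL_NOCHECK simply hands the
   CUresult back to the caller for explicit checking.  */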
176 static const char *
177 cuda_error (CUresult r)
179 const char *fallback = "unknown cuda error";
180 const char *desc;
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
189 return fallback;
192 static unsigned int instantiated_devices = 0;
193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
195 struct cuda_map
197 CUdeviceptr d;
198 size_t size;
199 bool active;
200 struct cuda_map *next;
203 struct ptx_stream
205 CUstream stream;
206 pthread_t host_thread;
207 bool multithreaded;
208 struct cuda_map *map;
209 struct ptx_stream *next;
212 /* Thread-specific data for PTX. */
214 struct nvptx_thread
216 struct ptx_stream *current_stream;
217 struct ptx_device *ptx_dev;
220 static struct cuda_map *
221 cuda_map_create (size_t size)
223 struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
225 assert (map);
227 map->next = NULL;
228 map->size = size;
229 map->active = false;
231 CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
232 assert (map->d);
234 return map;
237 static void
238 cuda_map_destroy (struct cuda_map *map)
240 CUDA_CALL_ASSERT (cuMemFree, map->d);
241 free (map);
244 /* The following map_* routines manage the CUDA device memory that
245 contains the data mapping arguments for cuLaunchKernel. Each
246 asynchronous PTX stream may have multiple pending kernel
247 invocations, which are launched in a FIFO order. As such, the map
248 routines maintain a queue of cuLaunchKernel arguments.
250 Calls to map_push and map_pop must be guarded by ptx_event_lock.
251 Likewise, calls to map_init and map_fini are guarded by
252 ptx_dev_lock inside GOMP_OFFLOAD_init_device and
253 GOMP_OFFLOAD_fini_device, respectively. */
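/* A sketch of how this queue is used further down (see nvptx_exec and
   event_gc; DP names the returned device pointer):

     dp = map_push (stream, mapnum * sizeof (void *));
     ... cuMemcpyHtoD the argument pointers to DP, then cuLaunchKernel ...
     map_pop (stream);   // once the launch's event has completed

   map_push reuses the stream's map when it is inactive and large enough,
   and otherwise allocates a fresh cuda_map with cuMemAlloc.  */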
255 static bool
256 map_init (struct ptx_stream *s)
258 int size = getpagesize ();
260 assert (s);
262 s->map = cuda_map_create (size);
264 return true;
267 static bool
268 map_fini (struct ptx_stream *s)
270 assert (s->map->next == NULL);
271 assert (!s->map->active);
273 cuda_map_destroy (s->map);
275 return true;
278 static void
279 map_pop (struct ptx_stream *s)
281 struct cuda_map *next;
283 assert (s != NULL);
285 if (s->map->next == NULL)
287 s->map->active = false;
288 return;
291 next = s->map->next;
292 cuda_map_destroy (s->map);
293 s->map = next;
296 static CUdeviceptr
297 map_push (struct ptx_stream *s, size_t size)
299 struct cuda_map *map = NULL, *t = NULL;
301 assert (s);
302 assert (s->map);
304 /* Each PTX stream requires a separate data region to store the
305 launch arguments for cuLaunchKernel. Allocate a new
306 cuda_map and push it to the end of the list. */
307 if (s->map->active)
309 map = cuda_map_create (size);
311 for (t = s->map; t->next != NULL; t = t->next)
314 t->next = map;
316 else if (s->map->size < size)
318 cuda_map_destroy (s->map);
319 map = cuda_map_create (size);
321 else
322 map = s->map;
324 s->map = map;
325 s->map->active = true;
327 return s->map->d;
330 /* Target data function launch information. */
332 struct targ_fn_launch
334 const char *fn;
335 unsigned short dim[GOMP_DIM_MAX];
338 /* Target PTX object information. */
340 struct targ_ptx_obj
342 const char *code;
343 size_t size;
346 /* Target data image information. */
348 typedef struct nvptx_tdata
350 const struct targ_ptx_obj *ptx_objs;
351 unsigned ptx_num;
353 const char *const *var_names;
354 unsigned var_num;
356 const struct targ_fn_launch *fn_descs;
357 unsigned fn_num;
358 } nvptx_tdata_t;
360 /* Descriptor of a loaded function. */
362 struct targ_fn_descriptor
364 CUfunction fn;
365 const struct targ_fn_launch *launch;
366 int regs_per_thread;
367 int max_threads_per_block;
370 /* A loaded PTX image. */
371 struct ptx_image_data
373 const void *target_data;
374 CUmodule module;
376 struct targ_fn_descriptor *fns; /* Array of functions. */
378 struct ptx_image_data *next;
381 struct ptx_device
383 CUcontext ctx;
384 bool ctx_shared;
385 CUdevice dev;
386 struct ptx_stream *null_stream;
387 /* All non-null streams associated with this device (actually context),
388 either created implicitly or passed in from the user (via
389 acc_set_cuda_stream). */
390 struct ptx_stream *active_streams;
391 struct {
392 struct ptx_stream **arr;
393 int size;
394 } async_streams;
395 /* A lock for use when manipulating the above stream list and array. */
396 pthread_mutex_t stream_lock;
397 int ord;
398 bool overlap;
399 bool map;
400 bool concur;
401 bool mkern;
402 int mode;
403 int clock_khz;
404 int num_sms;
405 int regs_per_block;
406 int regs_per_sm;
407 int warp_size;
408 int max_threads_per_block;
409 int max_threads_per_multiprocessor;
410 int default_dims[GOMP_DIM_MAX];
412 struct ptx_image_data *images; /* Images loaded on device. */
413 pthread_mutex_t image_lock; /* Lock for above list. */
415 struct ptx_device *next;
418 enum ptx_event_type
420 PTX_EVT_MEM,
421 PTX_EVT_KNL,
422 PTX_EVT_SYNC,
423 PTX_EVT_ASYNC_CLEANUP
426 struct ptx_event
428 CUevent *evt;
429 int type;
430 void *addr;
431 int ord;
432 int val;
434 struct ptx_event *next;
437 static pthread_mutex_t ptx_event_lock;
438 static struct ptx_event *ptx_events;
440 static struct ptx_device **ptx_devices;
442 static inline struct nvptx_thread *
443 nvptx_thread (void)
445 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
448 static bool
449 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
451 int i;
452 struct ptx_stream *null_stream
453 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
455 null_stream->stream = NULL;
456 null_stream->host_thread = pthread_self ();
457 null_stream->multithreaded = true;
458 if (!map_init (null_stream))
459 return false;
461 ptx_dev->null_stream = null_stream;
462 ptx_dev->active_streams = NULL;
463 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
465 if (concurrency < 1)
466 concurrency = 1;
468 /* This is just a guess -- make space for as many async streams as the
469 current device is capable of concurrently executing. This can grow
470 later as necessary. No streams are created yet. */
471 ptx_dev->async_streams.arr
472 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
473 ptx_dev->async_streams.size = concurrency;
475 for (i = 0; i < concurrency; i++)
476 ptx_dev->async_streams.arr[i] = NULL;
478 return true;
481 static bool
482 fini_streams_for_device (struct ptx_device *ptx_dev)
484 free (ptx_dev->async_streams.arr);
486 bool ret = true;
487 while (ptx_dev->active_streams != NULL)
489 struct ptx_stream *s = ptx_dev->active_streams;
490 ptx_dev->active_streams = ptx_dev->active_streams->next;
492 ret &= map_fini (s);
494 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
495 if (r != CUDA_SUCCESS)
497 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
498 ret = false;
500 free (s);
503 ret &= map_fini (ptx_dev->null_stream);
504 free (ptx_dev->null_stream);
505 return ret;
508 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
509 thread THREAD (and also current device/context). If CREATE is true, create
510 the stream if it does not exist (or use EXISTING if it is non-NULL), and
511 associate the stream with the same thread argument. Returns stream to use
512 as result. */
514 static struct ptx_stream *
515 select_stream_for_async (int async, pthread_t thread, bool create,
516 CUstream existing)
518 struct nvptx_thread *nvthd = nvptx_thread ();
519 /* Local copy of TLS variable. */
520 struct ptx_device *ptx_dev = nvthd->ptx_dev;
521 struct ptx_stream *stream = NULL;
522 int orig_async = async;
524 /* The special value acc_async_noval (-1) maps (for now) to an
525 implicitly-created stream, which is then handled the same as any other
526 numbered async stream. Other options are available, e.g. using the null
527 stream for anonymous async operations, or choosing an idle stream from an
528 active set. But, stick with this for now. */
529 if (async > acc_async_sync)
530 async++;
532 if (create)
533 pthread_mutex_lock (&ptx_dev->stream_lock);
535 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
536 null stream, and in fact better performance may be obtainable if it doesn't
537 (because the null stream enforces overly-strict synchronisation with
538 respect to other streams for legacy reasons, and that's probably not
539 needed with OpenACC). Maybe investigate later. */
540 if (async == acc_async_sync)
541 stream = ptx_dev->null_stream;
542 else if (async >= 0 && async < ptx_dev->async_streams.size
543 && ptx_dev->async_streams.arr[async] && !(create && existing))
544 stream = ptx_dev->async_streams.arr[async];
545 else if (async >= 0 && create)
547 if (async >= ptx_dev->async_streams.size)
549 int i, newsize = ptx_dev->async_streams.size * 2;
551 if (async >= newsize)
552 newsize = async + 1;
554 ptx_dev->async_streams.arr
555 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
556 newsize * sizeof (struct ptx_stream *));
558 for (i = ptx_dev->async_streams.size; i < newsize; i++)
559 ptx_dev->async_streams.arr[i] = NULL;
561 ptx_dev->async_streams.size = newsize;
564 /* Create a new stream on-demand if there isn't one already, or if we're
565 setting a particular async value to an existing (externally-provided)
566 stream. */
567 if (!ptx_dev->async_streams.arr[async] || existing)
569 CUresult r;
570 struct ptx_stream *s
571 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
573 if (existing)
574 s->stream = existing;
575 else
577 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
578 CU_STREAM_DEFAULT);
579 if (r != CUDA_SUCCESS)
581 pthread_mutex_unlock (&ptx_dev->stream_lock);
582 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
583 cuda_error (r));
587 /* If CREATE is true, we're going to be queueing some work on this
588 stream. Associate it with the current host thread. */
589 s->host_thread = thread;
590 s->multithreaded = false;
592 if (!map_init (s))
594 pthread_mutex_unlock (&ptx_dev->stream_lock);
595 GOMP_PLUGIN_fatal ("map_init fail");
598 s->next = ptx_dev->active_streams;
599 ptx_dev->active_streams = s;
600 ptx_dev->async_streams.arr[async] = s;
603 stream = ptx_dev->async_streams.arr[async];
605 else if (async < 0)
607 if (create)
608 pthread_mutex_unlock (&ptx_dev->stream_lock);
609 GOMP_PLUGIN_fatal ("bad async %d", async);
612 if (create)
614 assert (stream != NULL);
616 /* If we're trying to use the same stream from different threads
617 simultaneously, set stream->multithreaded to true. This affects the
618 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
619 only wait for asynchronous launches from the same host thread they are
620 invoked on. If multiple threads use the same async value, we make note
621 of that here and fall back to testing/waiting for all threads in those
622 functions. */
623 if (thread != stream->host_thread)
624 stream->multithreaded = true;
626 pthread_mutex_unlock (&ptx_dev->stream_lock);
628 else if (stream && !stream->multithreaded
629 && !pthread_equal (stream->host_thread, thread))
630 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
632 return stream;
635 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
636 should be locked on entry and remains locked on exit. */
638 static bool
639 nvptx_init (void)
641 int ndevs;
643 if (instantiated_devices != 0)
644 return true;
646 ptx_events = NULL;
647 pthread_mutex_init (&ptx_event_lock, NULL);
649 if (!init_cuda_lib ())
650 return false;
652 CUDA_CALL (cuInit, 0);
654 CUDA_CALL (cuDeviceGetCount, &ndevs);
655 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
656 * ndevs);
657 return true;
660 /* Select the N'th PTX device for the current host thread. The device must
661 have been opened before calling this function. */
663 static bool
664 nvptx_attach_host_thread_to_device (int n)
666 CUdevice dev;
667 CUresult r;
668 struct ptx_device *ptx_dev;
669 CUcontext thd_ctx;
671 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
672 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
674 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
675 return false;
678 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
679 return true;
680 else
682 CUcontext old_ctx;
684 ptx_dev = ptx_devices[n];
685 if (!ptx_dev)
687 GOMP_PLUGIN_error ("device %d not found", n);
688 return false;
691 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
693 /* We don't necessarily have a current context (e.g. if it has been
694 destroyed). Pop it if we do, though. */
695 if (thd_ctx != NULL)
696 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
698 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
700 return true;
703 static struct ptx_device *
704 nvptx_open_device (int n)
706 struct ptx_device *ptx_dev;
707 CUdevice dev, ctx_dev;
708 CUresult r;
709 int async_engines, pi;
711 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
713 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
715 ptx_dev->ord = n;
716 ptx_dev->dev = dev;
717 ptx_dev->ctx_shared = false;
719 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
720 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
722 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
723 return NULL;
726 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
728 /* The current host thread has an active context for a different device.
729 Detach it. */
730 CUcontext old_ctx;
731 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
734 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
736 if (!ptx_dev->ctx)
737 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
738 else
739 ptx_dev->ctx_shared = true;
741 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
742 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
743 ptx_dev->overlap = pi;
745 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
746 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
747 ptx_dev->map = pi;
749 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
750 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
751 ptx_dev->concur = pi;
753 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
754 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
755 ptx_dev->mode = pi;
757 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
758 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
759 ptx_dev->mkern = pi;
761 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
762 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
763 ptx_dev->clock_khz = pi;
765 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
766 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
767 ptx_dev->num_sms = pi;
769 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
770 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
771 ptx_dev->regs_per_block = pi;
773 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
774 in CUDA 6.0 and newer. */
775 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
776 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
777 dev);
778 /* Fallback: use limit of registers per block, which is usually equal. */
779 if (r == CUDA_ERROR_INVALID_VALUE)
780 pi = ptx_dev->regs_per_block;
781 else if (r != CUDA_SUCCESS)
783 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
784 return NULL;
786 ptx_dev->regs_per_sm = pi;
788 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
789 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
790 if (pi != 32)
792 GOMP_PLUGIN_error ("Only warp size 32 is supported");
793 return NULL;
795 ptx_dev->warp_size = pi;
797 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
798 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
799 ptx_dev->max_threads_per_block = pi;
801 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
802 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
803 ptx_dev->max_threads_per_multiprocessor = pi;
805 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
806 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
807 if (r != CUDA_SUCCESS)
808 async_engines = 1;
810 for (int i = 0; i != GOMP_DIM_MAX; i++)
811 ptx_dev->default_dims[i] = 0;
813 ptx_dev->images = NULL;
814 pthread_mutex_init (&ptx_dev->image_lock, NULL);
816 if (!init_streams_for_device (ptx_dev, async_engines))
817 return NULL;
819 return ptx_dev;
822 static bool
823 nvptx_close_device (struct ptx_device *ptx_dev)
825 if (!ptx_dev)
826 return true;
828 if (!fini_streams_for_device (ptx_dev))
829 return false;
831 pthread_mutex_destroy (&ptx_dev->image_lock);
833 if (!ptx_dev->ctx_shared)
834 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
836 free (ptx_dev);
837 return true;
840 static int
841 nvptx_get_num_devices (void)
843 int n;
845 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
846 configurations. */
847 if (sizeof (void *) != 8)
849 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
850 " only 64-bit configurations are supported\n");
851 return 0;
854 /* This function will be called before the plugin has been initialized in
855 order to enumerate available devices, but CUDA API routines can't be used
856 until cuInit has been called. Just call it now (but don't yet do any
857 further initialization). */
858 if (instantiated_devices == 0)
860 if (!init_cuda_lib ())
861 return 0;
862 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
863 /* This is not an error: e.g. we may have CUDA libraries installed but
864 no devices available. */
865 if (r != CUDA_SUCCESS)
867 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
868 cuda_error (r));
869 return 0;
873 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
874 return n;
877 static void
878 notify_var (const char *var_name, const char *env_var)
880 if (env_var == NULL)
881 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
882 else
883 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
886 static void
887 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
889 const char *var_name = "GOMP_NVPTX_JIT";
890 const char *env_var = secure_getenv (var_name);
891 notify_var (var_name, env_var);
893 if (env_var == NULL)
894 return;
896 const char *c = env_var;
897 while (*c != '\0')
899 while (*c == ' ')
900 c++;
902 if (c[0] == '-' && c[1] == 'O'
903 && '0' <= c[2] && c[2] <= '4'
904 && (c[3] == '\0' || c[3] == ' '))
906 *gomp_nvptx_o = c[2] - '0';
907 c += 3;
908 continue;
911 GOMP_PLUGIN_error ("Error parsing %s", var_name);
912 break;
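/* For example, GOMP_NVPTX_JIT=-O3 in the environment selects PTX JIT
   optimization level 3; only -O0 through -O4 are accepted, and the value
   is passed to the JIT below via CU_JIT_OPTIMIZATION_LEVEL.  */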
916 static bool
917 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
918 unsigned num_objs)
920 CUjit_option opts[7];
921 void *optvals[7];
922 float elapsed = 0.0;
923 char elog[1024];
924 char ilog[16384];
925 CUlinkState linkstate;
926 CUresult r;
927 void *linkout;
928 size_t linkoutsize __attribute__ ((unused));
930 opts[0] = CU_JIT_WALL_TIME;
931 optvals[0] = &elapsed;
933 opts[1] = CU_JIT_INFO_LOG_BUFFER;
934 optvals[1] = &ilog[0];
936 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
937 optvals[2] = (void *) sizeof ilog;
939 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
940 optvals[3] = &elog[0];
942 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
943 optvals[4] = (void *) sizeof elog;
945 opts[5] = CU_JIT_LOG_VERBOSE;
946 optvals[5] = (void *) 1;
948 static intptr_t gomp_nvptx_o = -1;
950 static bool init_done = false;
951 if (!init_done)
953 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
954 init_done = true;
957 int nopts = 6;
958 if (gomp_nvptx_o != -1)
960 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
961 optvals[nopts] = (void *) gomp_nvptx_o;
962 nopts++;
965 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
966 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
967 else
968 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
970 for (; num_objs--; ptx_objs++)
972 /* cuLinkAddData's 'data' argument erroneously omits the const
973 qualifier. */
974 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
975 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
976 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
977 (char *) ptx_objs->code, ptx_objs->size,
978 0, 0, 0, 0);
979 else
980 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
981 (char *) ptx_objs->code, ptx_objs->size,
982 0, 0, 0, 0);
983 if (r != CUDA_SUCCESS)
985 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
986 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
987 cuda_error (r));
988 return false;
992 GOMP_PLUGIN_debug (0, "Linking\n");
993 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
995 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
996 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
998 if (r != CUDA_SUCCESS)
1000 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
1001 return false;
1004 CUDA_CALL (cuModuleLoadData, module, linkout);
1005 CUDA_CALL (cuLinkDestroy, linkstate);
1006 return true;
1009 static void
1010 event_gc (bool memmap_lockable)
1012 struct ptx_event *ptx_event = ptx_events;
1013 struct ptx_event *async_cleanups = NULL;
1014 struct nvptx_thread *nvthd = nvptx_thread ();
1016 pthread_mutex_lock (&ptx_event_lock);
1018 while (ptx_event != NULL)
1020 CUresult r;
1021 struct ptx_event *e = ptx_event;
1023 ptx_event = ptx_event->next;
1025 if (e->ord != nvthd->ptx_dev->ord)
1026 continue;
1028 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1029 if (r == CUDA_SUCCESS)
1031 bool append_async = false;
1032 CUevent *te;
1034 te = e->evt;
1036 switch (e->type)
1038 case PTX_EVT_MEM:
1039 case PTX_EVT_SYNC:
1040 break;
1042 case PTX_EVT_KNL:
1043 map_pop (e->addr);
1044 break;
1046 case PTX_EVT_ASYNC_CLEANUP:
1048 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
1049 memory-map splay tree lock for the current device, so we
1050 can't call it when one of our callers has already claimed
1051 the lock. In that case, just delay the GC for this event
1052 until later. */
1053 if (!memmap_lockable)
1054 continue;
1056 append_async = true;
1058 break;
1061 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1062 free ((void *)te);
1064 /* Unlink 'e' from ptx_events list. */
1065 if (ptx_events == e)
1066 ptx_events = ptx_events->next;
1067 else
1069 struct ptx_event *e_ = ptx_events;
1070 while (e_->next != e)
1071 e_ = e_->next;
1072 e_->next = e_->next->next;
1075 if (append_async)
1077 e->next = async_cleanups;
1078 async_cleanups = e;
1080 else
1081 free (e);
1085 pthread_mutex_unlock (&ptx_event_lock);
1087 /* We have to do these here, after ptx_event_lock is released. */
1088 while (async_cleanups)
1090 struct ptx_event *e = async_cleanups;
1091 async_cleanups = async_cleanups->next;
1093 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1094 free (e);
1098 static void
1099 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1101 struct ptx_event *ptx_event;
1102 struct nvptx_thread *nvthd = nvptx_thread ();
1104 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1105 || type == PTX_EVT_ASYNC_CLEANUP);
1107 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1108 ptx_event->type = type;
1109 ptx_event->evt = e;
1110 ptx_event->addr = h;
1111 ptx_event->ord = nvthd->ptx_dev->ord;
1112 ptx_event->val = val;
1114 pthread_mutex_lock (&ptx_event_lock);
1116 ptx_event->next = ptx_events;
1117 ptx_events = ptx_event;
1119 pthread_mutex_unlock (&ptx_event_lock);
1122 static void
1123 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1124 int async, unsigned *dims, void *targ_mem_desc)
1126 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1127 CUfunction function;
1128 CUresult r;
1129 int i;
1130 struct ptx_stream *dev_str;
1131 void *kargs[1];
1132 void *hp;
1133 CUdeviceptr dp;
1134 struct nvptx_thread *nvthd = nvptx_thread ();
1135 int warp_size = nvthd->ptx_dev->warp_size;
1136 const char *maybe_abort_msg = "(perhaps abort was called)";
1138 function = targ_fn->fn;
1140 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1141 assert (dev_str == nvthd->current_stream);
1143 /* Initialize the launch dimensions. Typically this is constant,
1144 provided by the device compiler, but we must permit runtime
1145 values. */
1146 int seen_zero = 0;
1147 for (i = 0; i != GOMP_DIM_MAX; i++)
1149 if (targ_fn->launch->dim[i])
1150 dims[i] = targ_fn->launch->dim[i];
1151 if (!dims[i])
1152 seen_zero = 1;
1155 if (seen_zero)
1157 pthread_mutex_lock (&ptx_dev_lock);
1159 static int gomp_openacc_dims[GOMP_DIM_MAX];
1160 if (!gomp_openacc_dims[0])
1162 /* See if the user provided GOMP_OPENACC_DIM environment
1163 variable to specify runtime defaults. */
1164 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1165 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
1168 if (!nvthd->ptx_dev->default_dims[0])
1170 int default_dims[GOMP_DIM_MAX];
1171 for (int i = 0; i < GOMP_DIM_MAX; ++i)
1172 default_dims[i] = gomp_openacc_dims[i];
1174 int gang, worker, vector;
1176 int block_size = nvthd->ptx_dev->max_threads_per_block;
1177 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
1178 int dev_size = nvthd->ptx_dev->num_sms;
1179 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1180 " dev_size=%d, cpu_size=%d\n",
1181 warp_size, block_size, dev_size, cpu_size);
1183 gang = (cpu_size / block_size) * dev_size;
1184 worker = block_size / warp_size;
1185 vector = warp_size;
1188 /* There is no upper bound on the gang size. The best size
1189 matches the hardware configuration. Logical gangs are
1190 scheduled onto physical hardware. To maximize usage, we
1191 should guess a large number. */
1192 if (default_dims[GOMP_DIM_GANG] < 1)
1193 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1194 /* The worker size must not exceed the hardware. */
1195 if (default_dims[GOMP_DIM_WORKER] < 1
1196 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1197 default_dims[GOMP_DIM_WORKER] = worker;
1198 /* The vector size must exactly match the hardware. */
1199 if (default_dims[GOMP_DIM_VECTOR] < 1
1200 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1201 default_dims[GOMP_DIM_VECTOR] = vector;
1203 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1204 default_dims[GOMP_DIM_GANG],
1205 default_dims[GOMP_DIM_WORKER],
1206 default_dims[GOMP_DIM_VECTOR]);
1208 for (i = 0; i != GOMP_DIM_MAX; i++)
1209 nvthd->ptx_dev->default_dims[i] = default_dims[i];
1211 pthread_mutex_unlock (&ptx_dev_lock);
1214 bool default_dim_p[GOMP_DIM_MAX];
1215 for (i = 0; i != GOMP_DIM_MAX; i++)
1216 default_dim_p[i] = !dims[i];
1218 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
1220 for (i = 0; i != GOMP_DIM_MAX; i++)
1221 if (default_dim_p[i])
1222 dims[i] = nvthd->ptx_dev->default_dims[i];
1224 if (default_dim_p[GOMP_DIM_VECTOR])
1225 dims[GOMP_DIM_VECTOR]
1226 = MIN (dims[GOMP_DIM_VECTOR],
1227 (targ_fn->max_threads_per_block / warp_size
1228 * warp_size));
1230 if (default_dim_p[GOMP_DIM_WORKER])
1231 dims[GOMP_DIM_WORKER]
1232 = MIN (dims[GOMP_DIM_WORKER],
1233 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
1235 else
1237 /* Handle the case that the compiler allows the runtime to choose
1238 the vector-length conservatively, by ignoring
1239 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
1240 it. */
1241 int vectors = 0;
1242 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
1243 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
1244 exceed targ_fn->max_threads_per_block. */
1245 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
1246 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
1247 int grids, blocks;
1249 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
1250 &blocks, function, NULL, 0,
1251 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
1252 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
1253 "grid = %d, block = %d\n", grids, blocks);
1255 /* Keep the num_gangs proportional to the block size. In
1256 the case where a block size is limited by shared memory
1257 or the register file capacity, the runtime will not
1258 excessively over-assign gangs to the multiprocessor
1259 units if their state is going to be swapped out even
1260 more than necessary. The constant factor 2 is there to
1261 prevent threads from idling when there is insufficient
1262 work for them. */
1263 if (gangs == 0)
1264 gangs = 2 * grids * (blocks / warp_size);
1266 if (vectors == 0)
1267 vectors = warp_size;
1269 if (workers == 0)
1271 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
1272 ? vectors
1273 : dims[GOMP_DIM_VECTOR]);
1274 workers = blocks / actual_vectors;
1277 for (i = 0; i != GOMP_DIM_MAX; i++)
1278 if (default_dim_p[i])
1279 switch (i)
1281 case GOMP_DIM_GANG: dims[i] = gangs; break;
1282 case GOMP_DIM_WORKER: dims[i] = workers; break;
1283 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
1284 default: GOMP_PLUGIN_fatal ("invalid dim");
1290 /* Check if the accelerator has sufficient hardware resources to
1291 launch the offloaded kernel. */
1292 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
1293 > targ_fn->max_threads_per_block)
1295 int suggest_workers
1296 = targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
1297 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1298 " launch '%s' with num_workers = %d; recompile the"
1299 " program with 'num_workers = %d' on that offloaded"
1300 " region or '-fopenacc-dim=:%d'",
1301 targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1302 suggest_workers, suggest_workers);
1305 /* Reserve a chunk of device memory from this stream's argument map to
1306 hold the cuLaunchKernel argument array.  DP is the device pointer to the
1307 new chunk; HP is a temporary host buffer used to stage the arguments. */
1308 pthread_mutex_lock (&ptx_event_lock);
1309 dp = map_push (dev_str, mapnum * sizeof (void *));
1310 pthread_mutex_unlock (&ptx_event_lock);
1312 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1314 /* Copy the array of arguments to the mapped page. */
1315 hp = alloca(sizeof(void *) * mapnum);
1316 for (i = 0; i < mapnum; i++)
1317 ((void **) hp)[i] = devaddrs[i];
1319 /* Copy the (device) pointers to arguments to the device */
1320 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
1321 mapnum * sizeof (void *));
1322 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1323 " gangs=%u, workers=%u, vectors=%u\n",
1324 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1325 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1327 // OpenACC CUDA
1329 // num_gangs nctaid.x
1330 // num_workers ntid.y
1331 // vector length ntid.x
1333 kargs[0] = &dp;
1334 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1335 dims[GOMP_DIM_GANG], 1, 1,
1336 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1337 0, dev_str->stream, kargs, 0);
1339 #ifndef DISABLE_ASYNC
1340 if (async < acc_async_noval)
1342 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1343 if (r == CUDA_ERROR_LAUNCH_FAILED)
1344 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1345 maybe_abort_msg);
1346 else if (r != CUDA_SUCCESS)
1347 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1349 else
1351 CUevent *e;
1353 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1355 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1356 if (r == CUDA_ERROR_LAUNCH_FAILED)
1357 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1358 maybe_abort_msg);
1359 else if (r != CUDA_SUCCESS)
1360 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1362 event_gc (true);
1364 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1366 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1368 #else
1369 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1370 if (r == CUDA_ERROR_LAUNCH_FAILED)
1371 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1372 maybe_abort_msg);
1373 else if (r != CUDA_SUCCESS)
1374 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1375 #endif
1377 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1378 targ_fn->launch->fn);
1380 #ifndef DISABLE_ASYNC
1381 if (async < acc_async_noval)
1382 #endif
1383 map_pop (dev_str);
1386 void * openacc_get_current_cuda_context (void);
1388 static void *
1389 nvptx_alloc (size_t s)
1391 CUdeviceptr d;
1393 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1394 return (void *) d;
1397 static bool
1398 nvptx_free (void *p)
1400 CUdeviceptr pb;
1401 size_t ps;
1403 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1404 if ((CUdeviceptr) p != pb)
1406 GOMP_PLUGIN_error ("invalid device address");
1407 return false;
1410 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1411 return true;
1415 static bool
1416 nvptx_host2dev (void *d, const void *h, size_t s)
1418 CUdeviceptr pb;
1419 size_t ps;
1420 struct nvptx_thread *nvthd = nvptx_thread ();
1422 if (!s)
1423 return true;
1424 if (!d)
1426 GOMP_PLUGIN_error ("invalid device address");
1427 return false;
1430 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1432 if (!pb)
1434 GOMP_PLUGIN_error ("invalid device address");
1435 return false;
1437 if (!h)
1439 GOMP_PLUGIN_error ("invalid host address");
1440 return false;
1442 if (d == h)
1444 GOMP_PLUGIN_error ("invalid host or device address");
1445 return false;
1447 if ((void *)(d + s) > (void *)(pb + ps))
1449 GOMP_PLUGIN_error ("invalid size");
1450 return false;
1453 #ifndef DISABLE_ASYNC
1454 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1456 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1457 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1458 event_gc (false);
1459 CUDA_CALL (cuMemcpyHtoDAsync,
1460 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1461 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1462 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1464 else
1465 #endif
1466 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1468 return true;
1471 static bool
1472 nvptx_dev2host (void *h, const void *d, size_t s)
1474 CUdeviceptr pb;
1475 size_t ps;
1476 struct nvptx_thread *nvthd = nvptx_thread ();
1478 if (!s)
1479 return true;
1480 if (!d)
1482 GOMP_PLUGIN_error ("invalid device address");
1483 return false;
1486 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1488 if (!pb)
1490 GOMP_PLUGIN_error ("invalid device address");
1491 return false;
1493 if (!h)
1495 GOMP_PLUGIN_error ("invalid host address");
1496 return false;
1498 if (d == h)
1500 GOMP_PLUGIN_error ("invalid host or device address");
1501 return false;
1503 if ((void *)(d + s) > (void *)(pb + ps))
1505 GOMP_PLUGIN_error ("invalid size");
1506 return false;
1509 #ifndef DISABLE_ASYNC
1510 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1512 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1513 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1514 event_gc (false);
1515 CUDA_CALL (cuMemcpyDtoHAsync,
1516 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1517 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1518 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1520 else
1521 #endif
1522 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1524 return true;
1527 static void
1528 nvptx_set_async (int async)
1530 struct nvptx_thread *nvthd = nvptx_thread ();
1531 nvthd->current_stream
1532 = select_stream_for_async (async, pthread_self (), true, NULL);
1535 static int
1536 nvptx_async_test (int async)
1538 CUresult r;
1539 struct ptx_stream *s;
1541 s = select_stream_for_async (async, pthread_self (), false, NULL);
1543 if (!s)
1544 GOMP_PLUGIN_fatal ("unknown async %d", async);
1546 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1547 if (r == CUDA_SUCCESS)
1549 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1550 whether all work has completed on this stream, and if so omits the call
1551 to the wait hook. If that happens, event_gc might not get called
1552 (which prevents variables from getting unmapped and their associated
1553 device storage freed), so call it here. */
1554 event_gc (true);
1555 return 1;
1557 else if (r == CUDA_ERROR_NOT_READY)
1558 return 0;
1560 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1562 return 0;
1565 static int
1566 nvptx_async_test_all (void)
1568 struct ptx_stream *s;
1569 pthread_t self = pthread_self ();
1570 struct nvptx_thread *nvthd = nvptx_thread ();
1572 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1574 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1576 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1577 && CUDA_CALL_NOCHECK (cuStreamQuery,
1578 s->stream) == CUDA_ERROR_NOT_READY)
1580 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1581 return 0;
1585 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1587 event_gc (true);
1589 return 1;
1592 static void
1593 nvptx_wait (int async)
1595 struct ptx_stream *s;
1597 s = select_stream_for_async (async, pthread_self (), false, NULL);
1598 if (!s)
1599 GOMP_PLUGIN_fatal ("unknown async %d", async);
1601 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1603 event_gc (true);
1606 static void
1607 nvptx_wait_async (int async1, int async2)
1609 CUevent *e;
1610 struct ptx_stream *s1, *s2;
1611 pthread_t self = pthread_self ();
1613 /* The stream that is waiting (rather than being waited for) doesn't
1614 necessarily have to exist already. */
1615 s2 = select_stream_for_async (async2, self, true, NULL);
1617 s1 = select_stream_for_async (async1, self, false, NULL);
1618 if (!s1)
1619 GOMP_PLUGIN_fatal ("invalid async 1\n");
1621 if (s1 == s2)
1622 GOMP_PLUGIN_fatal ("identical parameters");
1624 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1626 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1628 event_gc (true);
1630 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1632 event_add (PTX_EVT_SYNC, e, NULL, 0);
1634 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1637 static void
1638 nvptx_wait_all (void)
1640 CUresult r;
1641 struct ptx_stream *s;
1642 pthread_t self = pthread_self ();
1643 struct nvptx_thread *nvthd = nvptx_thread ();
1645 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1647 /* Wait for active streams initiated by this thread (or by multiple threads)
1648 to complete. */
1649 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1651 if (s->multithreaded || pthread_equal (s->host_thread, self))
1653 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1654 if (r == CUDA_SUCCESS)
1655 continue;
1656 else if (r != CUDA_ERROR_NOT_READY)
1657 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1659 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1663 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1665 event_gc (true);
1668 static void
1669 nvptx_wait_all_async (int async)
1671 struct ptx_stream *waiting_stream, *other_stream;
1672 CUevent *e;
1673 struct nvptx_thread *nvthd = nvptx_thread ();
1674 pthread_t self = pthread_self ();
1676 /* The stream doing the waiting. This could be the first mention of the
1677 stream, so create it if necessary. */
1678 waiting_stream
1679 = select_stream_for_async (async, pthread_self (), true, NULL);
1681 /* Launches on the null stream already block on other streams in the
1682 context. */
1683 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1684 return;
1686 event_gc (true);
1688 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1690 for (other_stream = nvthd->ptx_dev->active_streams;
1691 other_stream != NULL;
1692 other_stream = other_stream->next)
1694 if (!other_stream->multithreaded
1695 && !pthread_equal (other_stream->host_thread, self))
1696 continue;
1698 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1700 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1702 /* Record an event on the waited-for stream. */
1703 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1705 event_add (PTX_EVT_SYNC, e, NULL, 0);
1707 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1710 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1713 static void *
1714 nvptx_get_current_cuda_device (void)
1716 struct nvptx_thread *nvthd = nvptx_thread ();
1718 if (!nvthd || !nvthd->ptx_dev)
1719 return NULL;
1721 return &nvthd->ptx_dev->dev;
1724 static void *
1725 nvptx_get_current_cuda_context (void)
1727 struct nvptx_thread *nvthd = nvptx_thread ();
1729 if (!nvthd || !nvthd->ptx_dev)
1730 return NULL;
1732 return nvthd->ptx_dev->ctx;
1735 static void *
1736 nvptx_get_cuda_stream (int async)
1738 struct ptx_stream *s;
1739 struct nvptx_thread *nvthd = nvptx_thread ();
1741 if (!nvthd || !nvthd->ptx_dev)
1742 return NULL;
1744 s = select_stream_for_async (async, pthread_self (), false, NULL);
1746 return s ? s->stream : NULL;
1749 static int
1750 nvptx_set_cuda_stream (int async, void *stream)
1752 struct ptx_stream *oldstream;
1753 pthread_t self = pthread_self ();
1754 struct nvptx_thread *nvthd = nvptx_thread ();
1756 if (async < 0)
1757 GOMP_PLUGIN_fatal ("bad async %d", async);
1759 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1761 /* We have a list of active streams and an array mapping async values to
1762 entries of that list. We need to take "ownership" of the passed-in stream,
1763 and add it to our list, removing the previous entry also (if there was one)
1764 in order to prevent resource leaks. Note the potential for surprise
1765 here: maybe we should keep track of passed-in streams and leave it up to
1766 the user to tidy those up, but that doesn't work for stream handles
1767 returned from acc_get_cuda_stream above... */
1769 oldstream = select_stream_for_async (async, self, false, NULL);
1771 if (oldstream)
1773 if (nvthd->ptx_dev->active_streams == oldstream)
1774 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1775 else
1777 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1778 while (s->next != oldstream)
1779 s = s->next;
1780 s->next = s->next->next;
1783 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1785 if (!map_fini (oldstream))
1786 GOMP_PLUGIN_fatal ("error when freeing host memory");
1788 free (oldstream);
1791 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1793 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1795 return 1;
1798 /* Plugin entry points. */
1800 const char *
1801 GOMP_OFFLOAD_get_name (void)
1803 return "nvptx";
1806 unsigned int
1807 GOMP_OFFLOAD_get_caps (void)
1809 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1813 GOMP_OFFLOAD_get_type (void)
1815 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1819 GOMP_OFFLOAD_get_num_devices (void)
1821 return nvptx_get_num_devices ();
1824 bool
1825 GOMP_OFFLOAD_init_device (int n)
1827 struct ptx_device *dev;
1829 pthread_mutex_lock (&ptx_dev_lock);
1831 if (!nvptx_init () || ptx_devices[n] != NULL)
1833 pthread_mutex_unlock (&ptx_dev_lock);
1834 return false;
1837 dev = nvptx_open_device (n);
1838 if (dev)
1840 ptx_devices[n] = dev;
1841 instantiated_devices++;
1844 pthread_mutex_unlock (&ptx_dev_lock);
1846 return dev != NULL;
1849 bool
1850 GOMP_OFFLOAD_fini_device (int n)
1852 pthread_mutex_lock (&ptx_dev_lock);
1854 if (ptx_devices[n] != NULL)
1856 if (!nvptx_attach_host_thread_to_device (n)
1857 || !nvptx_close_device (ptx_devices[n]))
1859 pthread_mutex_unlock (&ptx_dev_lock);
1860 return false;
1862 ptx_devices[n] = NULL;
1863 instantiated_devices--;
1866 pthread_mutex_unlock (&ptx_dev_lock);
1867 return true;
1870 /* Return the libgomp version number we're compatible with. There is
1871 no requirement for cross-version compatibility. */
1873 unsigned
1874 GOMP_OFFLOAD_version (void)
1876 return GOMP_VERSION;
1879 /* Initialize __nvptx_clocktick, if present in MODULE. */
1881 static void
1882 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1884 CUdeviceptr dptr;
1885 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1886 module, "__nvptx_clocktick");
1887 if (r == CUDA_ERROR_NOT_FOUND)
1888 return;
1889 if (r != CUDA_SUCCESS)
1890 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1891 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1892 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1893 sizeof (__nvptx_clocktick));
1894 if (r != CUDA_SUCCESS)
1895 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1898 /* Load the (partial) program described by TARGET_DATA to device
1899 number ORD. Allocate and return TARGET_TABLE. */
1902 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1903 struct addr_pair **target_table)
1905 CUmodule module;
1906 const char *const *var_names;
1907 const struct targ_fn_launch *fn_descs;
1908 unsigned int fn_entries, var_entries, i, j;
1909 struct targ_fn_descriptor *targ_fns;
1910 struct addr_pair *targ_tbl;
1911 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1912 struct ptx_image_data *new_image;
1913 struct ptx_device *dev;
1915 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1917 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1918 " (expected %u, received %u)",
1919 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1920 return -1;
1923 if (!nvptx_attach_host_thread_to_device (ord)
1924 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1925 return -1;
1927 dev = ptx_devices[ord];
1929 /* The mkoffload utility emits a struct of pointers/integers at the
1930 start of each offload image. The array of kernel names and the
1931 function addresses form a one-to-one correspondence. */
1933 var_entries = img_header->var_num;
1934 var_names = img_header->var_names;
1935 fn_entries = img_header->fn_num;
1936 fn_descs = img_header->fn_descs;
1938 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1939 * (fn_entries + var_entries));
1940 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1941 * fn_entries);
1943 *target_table = targ_tbl;
1945 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1946 new_image->target_data = target_data;
1947 new_image->module = module;
1948 new_image->fns = targ_fns;
1950 pthread_mutex_lock (&dev->image_lock);
1951 new_image->next = dev->images;
1952 dev->images = new_image;
1953 pthread_mutex_unlock (&dev->image_lock);
1955 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1957 CUfunction function;
1958 int nregs, mthrs;
1960 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1961 fn_descs[i].fn);
1962 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1963 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1964 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1965 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1967 targ_fns->fn = function;
1968 targ_fns->launch = &fn_descs[i];
1969 targ_fns->regs_per_thread = nregs;
1970 targ_fns->max_threads_per_block = mthrs;
1972 targ_tbl->start = (uintptr_t) targ_fns;
1973 targ_tbl->end = targ_tbl->start + 1;
1976 for (j = 0; j < var_entries; j++, targ_tbl++)
1978 CUdeviceptr var;
1979 size_t bytes;
1981 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1982 &var, &bytes, module, var_names[j]);
1984 targ_tbl->start = (uintptr_t) var;
1985 targ_tbl->end = targ_tbl->start + bytes;
1988 nvptx_set_clocktick (module, dev);
1990 return fn_entries + var_entries;
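/* To illustrate the table just built: entries 0 .. fn_entries-1 point at
   struct targ_fn_descriptor objects (with END set to START + 1 as a dummy
   extent), and the following var_entries entries hold each variable's
   device address and size, in var_names order.  */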
1993 /* Unload the program described by TARGET_DATA.  DEV_DATA holds the
1994 function descriptors allocated by G_O_load_image. */
1996 bool
1997 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1999 struct ptx_image_data *image, **prev_p;
2000 struct ptx_device *dev = ptx_devices[ord];
2002 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
2004 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
2005 " (expected %u, received %u)",
2006 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
2007 return false;
2010 bool ret = true;
2011 pthread_mutex_lock (&dev->image_lock);
2012 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
2013 if (image->target_data == target_data)
2015 *prev_p = image->next;
2016 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
2017 ret = false;
2018 free (image->fns);
2019 free (image);
2020 break;
2022 pthread_mutex_unlock (&dev->image_lock);
2023 return ret;
2026 void *
2027 GOMP_OFFLOAD_alloc (int ord, size_t size)
2029 if (!nvptx_attach_host_thread_to_device (ord))
2030 return NULL;
2031 return nvptx_alloc (size);
2034 bool
2035 GOMP_OFFLOAD_free (int ord, void *ptr)
2037 return (nvptx_attach_host_thread_to_device (ord)
2038 && nvptx_free (ptr));
2041 bool
2042 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
2044 return (nvptx_attach_host_thread_to_device (ord)
2045 && nvptx_dev2host (dst, src, n));
2048 bool
2049 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
2051 return (nvptx_attach_host_thread_to_device (ord)
2052 && nvptx_host2dev (dst, src, n));
2055 bool
2056 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2058 struct ptx_device *ptx_dev = ptx_devices[ord];
2059 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
2060 ptx_dev->null_stream->stream);
2061 return true;
2064 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
2066 void
2067 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
2068 void **hostaddrs, void **devaddrs,
2069 int async, unsigned *dims, void *targ_mem_desc)
2071 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
2074 void
2075 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
2077 struct nvptx_thread *nvthd = nvptx_thread ();
2078 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
2080 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
2081 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
2082 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
2086 GOMP_OFFLOAD_openacc_async_test (int async)
2088 return nvptx_async_test (async);
2092 GOMP_OFFLOAD_openacc_async_test_all (void)
2094 return nvptx_async_test_all ();
2097 void
2098 GOMP_OFFLOAD_openacc_async_wait (int async)
2100 nvptx_wait (async);
2103 void
2104 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
2106 nvptx_wait_async (async1, async2);
2109 void
2110 GOMP_OFFLOAD_openacc_async_wait_all (void)
2112 nvptx_wait_all ();
2115 void
2116 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
2118 nvptx_wait_all_async (async);
2121 void
2122 GOMP_OFFLOAD_openacc_async_set_async (int async)
2124 nvptx_set_async (async);
2127 void *
2128 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2130 struct ptx_device *ptx_dev;
2131 struct nvptx_thread *nvthd
2132 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2133 CUcontext thd_ctx;
2135 ptx_dev = ptx_devices[ord];
2137 assert (ptx_dev);
2139 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2141 assert (ptx_dev->ctx);
2143 if (!thd_ctx)
2144 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2146 nvthd->current_stream = ptx_dev->null_stream;
2147 nvthd->ptx_dev = ptx_dev;
2149 return (void *) nvthd;
2152 void
2153 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2155 free (data);
2158 void *
2159 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2161 return nvptx_get_current_cuda_device ();
2164 void *
2165 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2167 return nvptx_get_current_cuda_context ();
2170 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2172 void *
2173 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2175 return nvptx_get_cuda_stream (async);
2178 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2181 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2183 return nvptx_set_cuda_stream (async, stream);
2186 /* Adjust launch dimensions: pick good values for number of blocks and warps
2187 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2188 own limits. */
2190 static void
2191 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2192 struct ptx_device *ptx_dev,
2193 int *teams_p, int *threads_p)
2195 int max_warps_block = fn->max_threads_per_block / 32;
2196 /* Maximum 32 warps per block is an implementation limit in the NVPTX backend
2197 and libgcc, which matches the documented limit of all GPUs as of 2015. */
2198 if (max_warps_block > 32)
2199 max_warps_block = 32;
2200 if (*threads_p <= 0)
2201 *threads_p = 8;
2202 if (*threads_p > max_warps_block)
2203 *threads_p = max_warps_block;
2205 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2206 /* This is an estimate of how many blocks the device can host simultaneously.
2207 The actual limit, which may be lower, can be queried with the "occupancy
2208 control" driver interface (since CUDA 6.0). */
2209 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2210 if (*teams_p <= 0 || *teams_p > max_blocks)
2211 *teams_p = max_blocks;
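/* An illustrative calculation (the numbers are hypothetical, not taken
   from any particular device): with regs_per_thread == 32 and
   *threads_p == 8, regs_per_block == 32 * 32 * 8 == 8192; on a device
   with regs_per_sm == 65536 and num_sms == 16, max_blocks
   == (65536 / 8192) * 16 == 128, so a non-positive or larger *teams_p
   is clamped to 128.  */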
2214 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2215 target regions. */
2217 static size_t
2218 nvptx_stacks_size ()
2220 return 128 * 1024;
2223 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2225 static void *
2226 nvptx_stacks_alloc (size_t size, int num)
2228 CUdeviceptr stacks;
2229 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2230 if (r != CUDA_SUCCESS)
2231 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2232 return (void *) stacks;
2235 /* Release storage previously allocated by nvptx_stacks_alloc. */
2237 static void
2238 nvptx_stacks_free (void *p, int num)
2240 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2241 if (r != CUDA_SUCCESS)
2242 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2245 void
2246 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2248 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2249 CUresult r;
2250 struct ptx_device *ptx_dev = ptx_devices[ord];
2251 const char *maybe_abort_msg = "(perhaps abort was called)";
2252 int teams = 0, threads = 0;
2254 if (!args)
2255 GOMP_PLUGIN_fatal ("No target arguments provided");
2256 while (*args)
2258 intptr_t id = (intptr_t) *args++, val;
2259 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2260 val = (intptr_t) *args++;
2261 else
2262 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2263 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2264 continue;
2265 val = val > INT_MAX ? INT_MAX : val;
2266 id &= GOMP_TARGET_ARG_ID_MASK;
2267 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2268 teams = val;
2269 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2270 threads = val;
2272 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2274 size_t stack_size = nvptx_stacks_size ();
2275 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2276 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2277 size_t fn_args_size = sizeof fn_args;
2278 void *config[] = {
2279 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2280 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2281 CU_LAUNCH_PARAM_END
2283 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2284 32, threads, 1, 0, ptx_dev->null_stream->stream,
2285 NULL, config);
2286 if (r != CUDA_SUCCESS)
2287 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2289 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2290 if (r == CUDA_ERROR_LAUNCH_FAILED)
2291 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2292 maybe_abort_msg);
2293 else if (r != CUDA_SUCCESS)
2294 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2295 nvptx_stacks_free (stacks, teams * threads);
2298 void
2299 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2300 void *async_data)
2302 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");