1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
is not clear as to what that state might be, or how one might
propagate it from one thread to another.  */
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
51 static const char *
52 cuda_error (CUresult r)
54 #if CUDA_VERSION < 7000
55 /* Specified in documentation and present in library from at least
56 5.5. Not declared in header file prior to 7.0. */
57 extern CUresult cuGetErrorString (CUresult, const char **);
58 #endif
59 const char *desc;
61 r = cuGetErrorString (r, &desc);
62 if (r != CUDA_SUCCESS)
63 desc = "unknown cuda error";
65 return desc;
68 /* Convenience macros for the frequently used CUDA library call and
69 error handling sequence. This does not capture all the cases we
70 use in this file, but is common enough. */
72 #define CUDA_CALL_ERET(ERET, FN, ...) \
73 do { \
74 unsigned __r = FN (__VA_ARGS__); \
75 if (__r != CUDA_SUCCESS) \
76 { \
77 GOMP_PLUGIN_error (#FN " error: %s", \
78 cuda_error (__r)); \
79 return ERET; \
80 } \
81 } while (0)
83 #define CUDA_CALL(FN, ...) \
84 CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
86 #define CUDA_CALL_ASSERT(FN, ...) \
87 do { \
88 unsigned __r = FN (__VA_ARGS__); \
89 if (__r != CUDA_SUCCESS) \
90 { \
91 GOMP_PLUGIN_fatal (#FN " error: %s", \
92 cuda_error (__r)); \
93 } \
94 } while (0)
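/* Illustrative usage sketch (not part of the plugin): how the three macros
   above are typically combined.  The helper below is hypothetical;
   CUDA_CALL_ERET supplies an arbitrary error return value, CUDA_CALL is
   shorthand for returning false, and CUDA_CALL_ASSERT aborts via
   GOMP_PLUGIN_fatal.  */
#if 0
static void *
example_alloc_sketch (size_t size)
{
  CUdeviceptr d;
  /* On failure, report the error and return NULL from this function.  */
  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, size);
  /* On failure, abort the whole process with GOMP_PLUGIN_fatal.  */
  CUDA_CALL_ASSERT (cuMemsetD8, d, 0, size);
  return (void *) d;
}
#endif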
96 static unsigned int instantiated_devices = 0;
97 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
99 struct ptx_stream
101 CUstream stream;
102 pthread_t host_thread;
103 bool multithreaded;
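  /* D and H are the device- and host-side views of a single page of
     page-locked memory, allocated in map_init with cuMemAllocHost and mapped
     to the device via cuMemHostGetDevicePointer.  H_BEGIN/H_END bound that
     page, and H_NEXT, H_PREV and H_TAIL implement a simple ring allocator
     from which map_push carves argument blocks and map_pop retires them.  */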
105 CUdeviceptr d;
106 void *h;
107 void *h_begin;
108 void *h_end;
109 void *h_next;
110 void *h_prev;
111 void *h_tail;
113 struct ptx_stream *next;
116 /* Thread-specific data for PTX. */
118 struct nvptx_thread
120 struct ptx_stream *current_stream;
121 struct ptx_device *ptx_dev;
124 struct map
126 int async;
127 size_t size;
128 char mappings[0];
131 static bool
132 map_init (struct ptx_stream *s)
134 int size = getpagesize ();
136 assert (s);
137 assert (!s->d);
138 assert (!s->h);
140 CUDA_CALL (cuMemAllocHost, &s->h, size);
141 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
143 assert (s->h);
145 s->h_begin = s->h;
146 s->h_end = s->h_begin + size;
147 s->h_next = s->h_prev = s->h_tail = s->h_begin;
149 assert (s->h_next);
150 assert (s->h_end);
151 return true;
154 static bool
155 map_fini (struct ptx_stream *s)
157 CUDA_CALL (cuMemFreeHost, s->h);
158 return true;
161 static void
162 map_pop (struct ptx_stream *s)
164 struct map *m;
166 assert (s != NULL);
167 assert (s->h_next);
168 assert (s->h_prev);
169 assert (s->h_tail);
171 m = s->h_tail;
173 s->h_tail += m->size;
175 if (s->h_tail >= s->h_end)
176 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
178 if (s->h_next == s->h_tail)
179 s->h_prev = s->h_next;
181 assert (s->h_next >= s->h_begin);
182 assert (s->h_tail >= s->h_begin);
183 assert (s->h_prev >= s->h_begin);
185 assert (s->h_next <= s->h_end);
186 assert (s->h_tail <= s->h_end);
187 assert (s->h_prev <= s->h_end);
190 static void
191 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
193 int left;
194 int offset;
195 struct map *m;
197 assert (s != NULL);
199 left = s->h_end - s->h_next;
200 size += sizeof (struct map);
202 assert (s->h_prev);
203 assert (s->h_next);
205 if (size >= left)
207 m = s->h_prev;
208 m->size += left;
209 s->h_next = s->h_begin;
211 if (s->h_next + size > s->h_end)
212 GOMP_PLUGIN_fatal ("unable to push map");
215 assert (s->h_next);
217 m = s->h_next;
218 m->async = async;
219 m->size = size;
221 offset = (void *)&m->mappings[0] - s->h;
223 *d = (void *)(s->d + offset);
224 *h = (void *)(s->h + offset);
226 s->h_prev = s->h_next;
227 s->h_next += size;
229 assert (s->h_prev);
230 assert (s->h_next);
232 assert (s->h_next >= s->h_begin);
233 assert (s->h_tail >= s->h_begin);
234 assert (s->h_prev >= s->h_begin);
235 assert (s->h_next <= s->h_end);
236 assert (s->h_tail <= s->h_end);
237 assert (s->h_prev <= s->h_end);
239 return;
242 /* Target data function launch information. */
244 struct targ_fn_launch
246 const char *fn;
247 unsigned short dim[GOMP_DIM_MAX];
250 /* Target PTX object information. */
252 struct targ_ptx_obj
254 const char *code;
255 size_t size;
258 /* Target data image information. */
260 typedef struct nvptx_tdata
262 const struct targ_ptx_obj *ptx_objs;
263 unsigned ptx_num;
265 const char *const *var_names;
266 unsigned var_num;
268 const struct targ_fn_launch *fn_descs;
269 unsigned fn_num;
270 } nvptx_tdata_t;
272 /* Descriptor of a loaded function. */
274 struct targ_fn_descriptor
276 CUfunction fn;
277 const struct targ_fn_launch *launch;
278 int regs_per_thread;
279 int max_threads_per_block;
282 /* A loaded PTX image. */
283 struct ptx_image_data
285 const void *target_data;
286 CUmodule module;
288 struct targ_fn_descriptor *fns; /* Array of functions. */
290 struct ptx_image_data *next;
293 struct ptx_device
295 CUcontext ctx;
296 bool ctx_shared;
297 CUdevice dev;
298 struct ptx_stream *null_stream;
299 /* All non-null streams associated with this device (actually context),
300 either created implicitly or passed in from the user (via
301 acc_set_cuda_stream). */
302 struct ptx_stream *active_streams;
303 struct {
304 struct ptx_stream **arr;
305 int size;
306 } async_streams;
307 /* A lock for use when manipulating the above stream list and array. */
308 pthread_mutex_t stream_lock;
309 int ord;
310 bool overlap;
311 bool map;
312 bool concur;
313 bool mkern;
314 int mode;
315 int clock_khz;
316 int num_sms;
317 int regs_per_block;
318 int regs_per_sm;
320 struct ptx_image_data *images; /* Images loaded on device. */
321 pthread_mutex_t image_lock; /* Lock for above list. */
323 struct ptx_device *next;
326 enum ptx_event_type
328 PTX_EVT_MEM,
329 PTX_EVT_KNL,
330 PTX_EVT_SYNC,
331 PTX_EVT_ASYNC_CLEANUP
334 struct ptx_event
336 CUevent *evt;
337 int type;
338 void *addr;
339 int ord;
340 int val;
342 struct ptx_event *next;
345 static pthread_mutex_t ptx_event_lock;
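/* Global list of outstanding events (kernel launches, memory copies, syncs
   and deferred async cleanups), protected by the lock above.  Completed
   entries are reclaimed lazily by event_gc.  */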
346 static struct ptx_event *ptx_events;
348 static struct ptx_device **ptx_devices;
350 static inline struct nvptx_thread *
351 nvptx_thread (void)
353 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
356 static bool
357 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
359 int i;
360 struct ptx_stream *null_stream
361 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
363 null_stream->stream = NULL;
364 null_stream->host_thread = pthread_self ();
365 null_stream->multithreaded = true;
366 null_stream->d = (CUdeviceptr) NULL;
367 null_stream->h = NULL;
368 if (!map_init (null_stream))
369 return false;
371 ptx_dev->null_stream = null_stream;
372 ptx_dev->active_streams = NULL;
373 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
375 if (concurrency < 1)
376 concurrency = 1;
378 /* This is just a guess -- make space for as many async streams as the
379 current device is capable of concurrently executing. This can grow
380 later as necessary. No streams are created yet. */
381 ptx_dev->async_streams.arr
382 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
383 ptx_dev->async_streams.size = concurrency;
385 for (i = 0; i < concurrency; i++)
386 ptx_dev->async_streams.arr[i] = NULL;
388 return true;
391 static bool
392 fini_streams_for_device (struct ptx_device *ptx_dev)
394 free (ptx_dev->async_streams.arr);
396 bool ret = true;
397 while (ptx_dev->active_streams != NULL)
399 struct ptx_stream *s = ptx_dev->active_streams;
400 ptx_dev->active_streams = ptx_dev->active_streams->next;
402 ret &= map_fini (s);
404 CUresult r = cuStreamDestroy (s->stream);
405 if (r != CUDA_SUCCESS)
407 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
408 ret = false;
410 free (s);
413 ret &= map_fini (ptx_dev->null_stream);
414 free (ptx_dev->null_stream);
415 return ret;
418 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
419 thread THREAD (and also current device/context). If CREATE is true, create
420 the stream if it does not exist (or use EXISTING if it is non-NULL), and
421 associate the stream with the same thread argument. Returns stream to use
422 as result. */
424 static struct ptx_stream *
425 select_stream_for_async (int async, pthread_t thread, bool create,
426 CUstream existing)
428 struct nvptx_thread *nvthd = nvptx_thread ();
429 /* Local copy of TLS variable. */
430 struct ptx_device *ptx_dev = nvthd->ptx_dev;
431 struct ptx_stream *stream = NULL;
432 int orig_async = async;
434 /* The special value acc_async_noval (-1) maps (for now) to an
435 implicitly-created stream, which is then handled the same as any other
436 numbered async stream. Other options are available, e.g. using the null
437 stream for anonymous async operations, or choosing an idle stream from an
438 active set. But, stick with this for now. */
439 if (async > acc_async_sync)
440 async++;
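  /* After this adjustment the stream array is indexed as:
     acc_async_noval (-1) -> slot 0, async 0 -> slot 1, async N -> slot N + 1.
     acc_async_sync itself is handled separately below and uses the null
     stream.  */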
442 if (create)
443 pthread_mutex_lock (&ptx_dev->stream_lock);
445 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
446 null stream, and in fact better performance may be obtainable if it doesn't
447 (because the null stream enforces overly-strict synchronisation with
448 respect to other streams for legacy reasons, and that's probably not
449 needed with OpenACC). Maybe investigate later. */
450 if (async == acc_async_sync)
451 stream = ptx_dev->null_stream;
452 else if (async >= 0 && async < ptx_dev->async_streams.size
453 && ptx_dev->async_streams.arr[async] && !(create && existing))
454 stream = ptx_dev->async_streams.arr[async];
455 else if (async >= 0 && create)
457 if (async >= ptx_dev->async_streams.size)
459 int i, newsize = ptx_dev->async_streams.size * 2;
461 if (async >= newsize)
462 newsize = async + 1;
464 ptx_dev->async_streams.arr
465 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
466 newsize * sizeof (struct ptx_stream *));
468 for (i = ptx_dev->async_streams.size; i < newsize; i++)
469 ptx_dev->async_streams.arr[i] = NULL;
471 ptx_dev->async_streams.size = newsize;
474 /* Create a new stream on-demand if there isn't one already, or if we're
475 setting a particular async value to an existing (externally-provided)
476 stream. */
477 if (!ptx_dev->async_streams.arr[async] || existing)
479 CUresult r;
480 struct ptx_stream *s
481 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
483 if (existing)
484 s->stream = existing;
485 else
487 r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
488 if (r != CUDA_SUCCESS)
490 pthread_mutex_unlock (&ptx_dev->stream_lock);
491 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
492 cuda_error (r));
496 /* If CREATE is true, we're going to be queueing some work on this
497 stream. Associate it with the current host thread. */
498 s->host_thread = thread;
499 s->multithreaded = false;
501 s->d = (CUdeviceptr) NULL;
502 s->h = NULL;
503 if (!map_init (s))
505 pthread_mutex_unlock (&ptx_dev->stream_lock);
506 GOMP_PLUGIN_fatal ("map_init fail");
509 s->next = ptx_dev->active_streams;
510 ptx_dev->active_streams = s;
511 ptx_dev->async_streams.arr[async] = s;
514 stream = ptx_dev->async_streams.arr[async];
516 else if (async < 0)
518 if (create)
519 pthread_mutex_unlock (&ptx_dev->stream_lock);
520 GOMP_PLUGIN_fatal ("bad async %d", async);
523 if (create)
525 assert (stream != NULL);
527 /* If we're trying to use the same stream from different threads
528 simultaneously, set stream->multithreaded to true. This affects the
529 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
530 only wait for asynchronous launches from the same host thread they are
531 invoked on. If multiple threads use the same async value, we make note
532 of that here and fall back to testing/waiting for all threads in those
533 functions. */
534 if (thread != stream->host_thread)
535 stream->multithreaded = true;
537 pthread_mutex_unlock (&ptx_dev->stream_lock);
539 else if (stream && !stream->multithreaded
540 && !pthread_equal (stream->host_thread, thread))
541 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
543 return stream;
546 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
547 should be locked on entry and remains locked on exit. */
549 static bool
550 nvptx_init (void)
552 int ndevs;
554 if (instantiated_devices != 0)
555 return true;
557 CUDA_CALL (cuInit, 0);
558 ptx_events = NULL;
559 pthread_mutex_init (&ptx_event_lock, NULL);
561 CUDA_CALL (cuDeviceGetCount, &ndevs);
562 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
563 * ndevs);
564 return true;
567 /* Select the N'th PTX device for the current host thread. The device must
568 have been previously opened before calling this function. */
570 static bool
571 nvptx_attach_host_thread_to_device (int n)
573 CUdevice dev;
574 CUresult r;
575 struct ptx_device *ptx_dev;
576 CUcontext thd_ctx;
578 r = cuCtxGetDevice (&dev);
579 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
581 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
582 return false;
585 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
586 return true;
587 else
589 CUcontext old_ctx;
591 ptx_dev = ptx_devices[n];
592 if (!ptx_dev)
594 GOMP_PLUGIN_error ("device %d not found", n);
595 return false;
598 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
/* We don't necessarily have a current context (e.g. if it has been
   destroyed).  Pop it if we do, though.  */
602 if (thd_ctx != NULL)
603 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
605 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
607 return true;
610 static struct ptx_device *
611 nvptx_open_device (int n)
613 struct ptx_device *ptx_dev;
614 CUdevice dev, ctx_dev;
615 CUresult r;
616 int async_engines, pi;
618 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
620 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
622 ptx_dev->ord = n;
623 ptx_dev->dev = dev;
624 ptx_dev->ctx_shared = false;
626 r = cuCtxGetDevice (&ctx_dev);
627 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
629 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
630 return NULL;
633 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
635 /* The current host thread has an active context for a different device.
636 Detach it. */
637 CUcontext old_ctx;
638 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
641 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
643 if (!ptx_dev->ctx)
644 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
645 else
646 ptx_dev->ctx_shared = true;
648 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
649 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
650 ptx_dev->overlap = pi;
652 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
653 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
654 ptx_dev->map = pi;
656 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
657 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
658 ptx_dev->concur = pi;
660 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
661 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
662 ptx_dev->mode = pi;
664 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
665 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
666 ptx_dev->mkern = pi;
668 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
669 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
670 ptx_dev->clock_khz = pi;
672 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
673 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
674 ptx_dev->num_sms = pi;
676 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
677 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
678 ptx_dev->regs_per_block = pi;
680 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
681 in CUDA 6.0 and newer. */
682 r = cuDeviceGetAttribute (&pi, 82, dev);
683 /* Fallback: use limit of registers per block, which is usually equal. */
684 if (r == CUDA_ERROR_INVALID_VALUE)
685 pi = ptx_dev->regs_per_block;
686 else if (r != CUDA_SUCCESS)
688 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
689 return NULL;
691 ptx_dev->regs_per_sm = pi;
693 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
694 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
695 if (pi != 32)
697 GOMP_PLUGIN_error ("Only warp size 32 is supported");
698 return NULL;
701 r = cuDeviceGetAttribute (&async_engines,
702 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
703 if (r != CUDA_SUCCESS)
704 async_engines = 1;
706 ptx_dev->images = NULL;
707 pthread_mutex_init (&ptx_dev->image_lock, NULL);
709 if (!init_streams_for_device (ptx_dev, async_engines))
710 return NULL;
712 return ptx_dev;
715 static bool
716 nvptx_close_device (struct ptx_device *ptx_dev)
718 if (!ptx_dev)
719 return true;
721 if (!fini_streams_for_device (ptx_dev))
722 return false;
724 pthread_mutex_destroy (&ptx_dev->image_lock);
726 if (!ptx_dev->ctx_shared)
727 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
729 free (ptx_dev);
730 return true;
733 static int
734 nvptx_get_num_devices (void)
736 int n;
738 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
739 configurations. */
740 if (sizeof (void *) != 8)
741 return 0;
743 /* This function will be called before the plugin has been initialized in
744 order to enumerate available devices, but CUDA API routines can't be used
745 until cuInit has been called. Just call it now (but don't yet do any
746 further initialization). */
747 if (instantiated_devices == 0)
749 CUresult r = cuInit (0);
750 /* This is not an error: e.g. we may have CUDA libraries installed but
751 no devices available. */
752 if (r != CUDA_SUCCESS)
753 return 0;
756 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
757 return n;
761 static bool
762 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
763 unsigned num_objs)
765 CUjit_option opts[6];
766 void *optvals[6];
767 float elapsed = 0.0;
768 char elog[1024];
769 char ilog[16384];
770 CUlinkState linkstate;
771 CUresult r;
772 void *linkout;
773 size_t linkoutsize __attribute__ ((unused));
775 opts[0] = CU_JIT_WALL_TIME;
776 optvals[0] = &elapsed;
778 opts[1] = CU_JIT_INFO_LOG_BUFFER;
779 optvals[1] = &ilog[0];
781 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
782 optvals[2] = (void *) sizeof ilog;
784 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
785 optvals[3] = &elog[0];
787 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
788 optvals[4] = (void *) sizeof elog;
790 opts[5] = CU_JIT_LOG_VERBOSE;
791 optvals[5] = (void *) 1;
793 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
795 for (; num_objs--; ptx_objs++)
797 /* cuLinkAddData's 'data' argument erroneously omits the const
798 qualifier. */
799 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
800 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
801 ptx_objs->size, 0, 0, 0, 0);
802 if (r != CUDA_SUCCESS)
804 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
805 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
806 cuda_error (r));
807 return false;
811 GOMP_PLUGIN_debug (0, "Linking\n");
812 r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
814 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
815 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
817 if (r != CUDA_SUCCESS)
819 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
820 return false;
823 CUDA_CALL (cuModuleLoadData, module, linkout);
824 CUDA_CALL (cuLinkDestroy, linkstate);
825 return true;
828 static void
829 event_gc (bool memmap_lockable)
831 struct ptx_event *ptx_event = ptx_events;
832 struct ptx_event *async_cleanups = NULL;
833 struct nvptx_thread *nvthd = nvptx_thread ();
835 pthread_mutex_lock (&ptx_event_lock);
837 while (ptx_event != NULL)
839 CUresult r;
840 struct ptx_event *e = ptx_event;
842 ptx_event = ptx_event->next;
844 if (e->ord != nvthd->ptx_dev->ord)
845 continue;
847 r = cuEventQuery (*e->evt);
848 if (r == CUDA_SUCCESS)
850 bool append_async = false;
851 CUevent *te;
853 te = e->evt;
855 switch (e->type)
857 case PTX_EVT_MEM:
858 case PTX_EVT_SYNC:
859 break;
861 case PTX_EVT_KNL:
862 map_pop (e->addr);
863 break;
865 case PTX_EVT_ASYNC_CLEANUP:
/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
868 memory-map splay tree lock for the current device, so we
869 can't call it when one of our callers has already claimed
870 the lock. In that case, just delay the GC for this event
871 until later. */
872 if (!memmap_lockable)
873 continue;
875 append_async = true;
877 break;
880 cuEventDestroy (*te);
881 free ((void *)te);
883 /* Unlink 'e' from ptx_events list. */
884 if (ptx_events == e)
885 ptx_events = ptx_events->next;
886 else
888 struct ptx_event *e_ = ptx_events;
889 while (e_->next != e)
890 e_ = e_->next;
891 e_->next = e_->next->next;
894 if (append_async)
896 e->next = async_cleanups;
897 async_cleanups = e;
899 else
900 free (e);
904 pthread_mutex_unlock (&ptx_event_lock);
906 /* We have to do these here, after ptx_event_lock is released. */
907 while (async_cleanups)
909 struct ptx_event *e = async_cleanups;
910 async_cleanups = async_cleanups->next;
912 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
913 free (e);
917 static void
918 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
920 struct ptx_event *ptx_event;
921 struct nvptx_thread *nvthd = nvptx_thread ();
923 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
924 || type == PTX_EVT_ASYNC_CLEANUP);
926 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
927 ptx_event->type = type;
928 ptx_event->evt = e;
929 ptx_event->addr = h;
930 ptx_event->ord = nvthd->ptx_dev->ord;
931 ptx_event->val = val;
933 pthread_mutex_lock (&ptx_event_lock);
935 ptx_event->next = ptx_events;
936 ptx_events = ptx_event;
938 pthread_mutex_unlock (&ptx_event_lock);
941 void
942 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
943 int async, unsigned *dims, void *targ_mem_desc)
945 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
946 CUfunction function;
947 CUresult r;
948 int i;
949 struct ptx_stream *dev_str;
950 void *kargs[1];
951 void *hp, *dp;
952 struct nvptx_thread *nvthd = nvptx_thread ();
953 const char *maybe_abort_msg = "(perhaps abort was called)";
955 function = targ_fn->fn;
957 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
958 assert (dev_str == nvthd->current_stream);
960 /* Initialize the launch dimensions. Typically this is constant,
961 provided by the device compiler, but we must permit runtime
962 values. */
963 int seen_zero = 0;
964 for (i = 0; i != GOMP_DIM_MAX; i++)
966 if (targ_fn->launch->dim[i])
967 dims[i] = targ_fn->launch->dim[i];
968 if (!dims[i])
969 seen_zero = 1;
972 if (seen_zero)
974 /* See if the user provided GOMP_OPENACC_DIM environment
975 variable to specify runtime defaults. */
976 static int default_dims[GOMP_DIM_MAX];
978 pthread_mutex_lock (&ptx_dev_lock);
979 if (!default_dims[0])
981 /* We only read the environment variable once. You can't
982 change it in the middle of execution. The syntax is
983 the same as for the -fopenacc-dim compilation option. */
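	  /* The value is a colon-separated triple, e.g.
	     GOMP_OPENACC_DIM=5120:32:32 for gang, worker and vector
	     respectively; omitted positions fall back to the defaults
	     computed below.  */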
984 const char *env_var = getenv ("GOMP_OPENACC_DIM");
985 if (env_var)
987 const char *pos = env_var;
989 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
991 if (i && *pos++ != ':')
992 break;
993 if (*pos != ':')
995 const char *eptr;
997 errno = 0;
998 long val = strtol (pos, (char **)&eptr, 10);
999 if (errno || val < 0 || (unsigned)val != val)
1000 break;
1001 default_dims[i] = (int)val;
1002 pos = eptr;
1007 int warp_size, block_size, dev_size, cpu_size;
1008 CUdevice dev = nvptx_thread()->ptx_dev->dev;
1009 /* 32 is the default for known hardware. */
1010 int gang = 0, worker = 32, vector = 32;
1011 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1013 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1014 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1015 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1016 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1018 if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
1019 && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
1020 && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
1021 && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
1023 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1024 " dev_size=%d, cpu_size=%d\n",
1025 warp_size, block_size, dev_size, cpu_size);
1026 gang = (cpu_size / block_size) * dev_size;
1027 worker = block_size / warp_size;
1028 vector = warp_size;
1031 /* There is no upper bound on the gang size. The best size
1032 matches the hardware configuration. Logical gangs are
1033 scheduled onto physical hardware. To maximize usage, we
1034 should guess a large number. */
1035 if (default_dims[GOMP_DIM_GANG] < 1)
1036 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1037 /* The worker size must not exceed the hardware. */
1038 if (default_dims[GOMP_DIM_WORKER] < 1
1039 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1040 default_dims[GOMP_DIM_WORKER] = worker;
1041 /* The vector size must exactly match the hardware. */
1042 if (default_dims[GOMP_DIM_VECTOR] < 1
1043 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1044 default_dims[GOMP_DIM_VECTOR] = vector;
1046 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1047 default_dims[GOMP_DIM_GANG],
1048 default_dims[GOMP_DIM_WORKER],
1049 default_dims[GOMP_DIM_VECTOR]);
1051 pthread_mutex_unlock (&ptx_dev_lock);
1053 for (i = 0; i != GOMP_DIM_MAX; i++)
1054 if (!dims[i])
1055 dims[i] = default_dims[i];
1058 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1059 the host and the device. HP is a host pointer to the new chunk, and DP is
1060 the corresponding device pointer. */
1061 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1063 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1065 /* Copy the array of arguments to the mapped page. */
1066 for (i = 0; i < mapnum; i++)
1067 ((void **) hp)[i] = devaddrs[i];
1069 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1070 fact have the same value on a unified-memory system). */
1071 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1072 mapnum * sizeof (void *));
1073 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1074 " gangs=%u, workers=%u, vectors=%u\n",
1075 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1076 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1078 // OpenACC CUDA
1080 // num_gangs nctaid.x
1081 // num_workers ntid.y
1082 // vector length ntid.x
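  // For example, dims = {gangs = 1024, workers = 8, vectors = 32} becomes a
  // 1024x1x1 grid of 32x8x1 blocks in the cuLaunchKernel call below.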
1084 kargs[0] = &dp;
1085 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1086 dims[GOMP_DIM_GANG], 1, 1,
1087 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1088 0, dev_str->stream, kargs, 0);
1090 #ifndef DISABLE_ASYNC
1091 if (async < acc_async_noval)
1093 r = cuStreamSynchronize (dev_str->stream);
1094 if (r == CUDA_ERROR_LAUNCH_FAILED)
1095 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1096 maybe_abort_msg);
1097 else if (r != CUDA_SUCCESS)
1098 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1100 else
1102 CUevent *e;
1104 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1106 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1107 if (r == CUDA_ERROR_LAUNCH_FAILED)
1108 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1109 maybe_abort_msg);
1110 else if (r != CUDA_SUCCESS)
1111 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1113 event_gc (true);
1115 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1117 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1119 #else
1120 r = cuCtxSynchronize ();
1121 if (r == CUDA_ERROR_LAUNCH_FAILED)
1122 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1123 maybe_abort_msg);
1124 else if (r != CUDA_SUCCESS)
1125 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1126 #endif
1128 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1129 targ_fn->launch->fn);
1131 #ifndef DISABLE_ASYNC
1132 if (async < acc_async_noval)
1133 #endif
1134 map_pop (dev_str);
1137 void * openacc_get_current_cuda_context (void);
1139 static void *
1140 nvptx_alloc (size_t s)
1142 CUdeviceptr d;
1144 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1145 return (void *) d;
1148 static bool
1149 nvptx_free (void *p)
1151 CUdeviceptr pb;
1152 size_t ps;
1154 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1155 if ((CUdeviceptr) p != pb)
1157 GOMP_PLUGIN_error ("invalid device address");
1158 return false;
1161 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1162 return true;
1166 static bool
1167 nvptx_host2dev (void *d, const void *h, size_t s)
1169 CUdeviceptr pb;
1170 size_t ps;
1171 struct nvptx_thread *nvthd = nvptx_thread ();
1173 if (!s)
1174 return true;
1175 if (!d)
1177 GOMP_PLUGIN_error ("invalid device address");
1178 return false;
1181 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1183 if (!pb)
1185 GOMP_PLUGIN_error ("invalid device address");
1186 return false;
1188 if (!h)
1190 GOMP_PLUGIN_error ("invalid host address");
1191 return false;
1193 if (d == h)
1195 GOMP_PLUGIN_error ("invalid host or device address");
1196 return false;
1198 if ((void *)(d + s) > (void *)(pb + ps))
1200 GOMP_PLUGIN_error ("invalid size");
1201 return false;
1204 #ifndef DISABLE_ASYNC
1205 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1207 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1208 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1209 event_gc (false);
1210 CUDA_CALL (cuMemcpyHtoDAsync,
1211 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1212 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1213 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1215 else
1216 #endif
1217 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1219 return true;
1222 static bool
1223 nvptx_dev2host (void *h, const void *d, size_t s)
1225 CUdeviceptr pb;
1226 size_t ps;
1227 struct nvptx_thread *nvthd = nvptx_thread ();
1229 if (!s)
1230 return true;
1231 if (!d)
1233 GOMP_PLUGIN_error ("invalid device address");
1234 return false;
1237 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1239 if (!pb)
1241 GOMP_PLUGIN_error ("invalid device address");
1242 return false;
1244 if (!h)
1246 GOMP_PLUGIN_error ("invalid host address");
1247 return false;
1249 if (d == h)
1251 GOMP_PLUGIN_error ("invalid host or device address");
1252 return false;
1254 if ((void *)(d + s) > (void *)(pb + ps))
1256 GOMP_PLUGIN_error ("invalid size");
1257 return false;
1260 #ifndef DISABLE_ASYNC
1261 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1263 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1264 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1265 event_gc (false);
1266 CUDA_CALL (cuMemcpyDtoHAsync,
1267 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1268 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1269 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1271 else
1272 #endif
1273 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1275 return true;
1278 static void
1279 nvptx_set_async (int async)
1281 struct nvptx_thread *nvthd = nvptx_thread ();
1282 nvthd->current_stream
1283 = select_stream_for_async (async, pthread_self (), true, NULL);
1286 static int
1287 nvptx_async_test (int async)
1289 CUresult r;
1290 struct ptx_stream *s;
1292 s = select_stream_for_async (async, pthread_self (), false, NULL);
1294 if (!s)
1295 GOMP_PLUGIN_fatal ("unknown async %d", async);
1297 r = cuStreamQuery (s->stream);
1298 if (r == CUDA_SUCCESS)
1300 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1301 whether all work has completed on this stream, and if so omits the call
1302 to the wait hook. If that happens, event_gc might not get called
1303 (which prevents variables from getting unmapped and their associated
1304 device storage freed), so call it here. */
1305 event_gc (true);
1306 return 1;
1308 else if (r == CUDA_ERROR_NOT_READY)
1309 return 0;
1311 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1313 return 0;
1316 static int
1317 nvptx_async_test_all (void)
1319 struct ptx_stream *s;
1320 pthread_t self = pthread_self ();
1321 struct nvptx_thread *nvthd = nvptx_thread ();
1323 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1325 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1327 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1328 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1330 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1331 return 0;
1335 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1337 event_gc (true);
1339 return 1;
1342 static void
1343 nvptx_wait (int async)
1345 struct ptx_stream *s;
1347 s = select_stream_for_async (async, pthread_self (), false, NULL);
1348 if (!s)
1349 GOMP_PLUGIN_fatal ("unknown async %d", async);
1351 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1353 event_gc (true);
1356 static void
1357 nvptx_wait_async (int async1, int async2)
1359 CUevent *e;
1360 struct ptx_stream *s1, *s2;
1361 pthread_t self = pthread_self ();
1363 /* The stream that is waiting (rather than being waited for) doesn't
1364 necessarily have to exist already. */
1365 s2 = select_stream_for_async (async2, self, true, NULL);
1367 s1 = select_stream_for_async (async1, self, false, NULL);
1368 if (!s1)
1369 GOMP_PLUGIN_fatal ("invalid async 1\n");
1371 if (s1 == s2)
1372 GOMP_PLUGIN_fatal ("identical parameters");
1374 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1376 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1378 event_gc (true);
1380 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1382 event_add (PTX_EVT_SYNC, e, NULL, 0);
1384 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1387 static void
1388 nvptx_wait_all (void)
1390 CUresult r;
1391 struct ptx_stream *s;
1392 pthread_t self = pthread_self ();
1393 struct nvptx_thread *nvthd = nvptx_thread ();
1395 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1397 /* Wait for active streams initiated by this thread (or by multiple threads)
1398 to complete. */
1399 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1401 if (s->multithreaded || pthread_equal (s->host_thread, self))
1403 r = cuStreamQuery (s->stream);
1404 if (r == CUDA_SUCCESS)
1405 continue;
1406 else if (r != CUDA_ERROR_NOT_READY)
1407 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1409 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1413 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1415 event_gc (true);
1418 static void
1419 nvptx_wait_all_async (int async)
1421 struct ptx_stream *waiting_stream, *other_stream;
1422 CUevent *e;
1423 struct nvptx_thread *nvthd = nvptx_thread ();
1424 pthread_t self = pthread_self ();
1426 /* The stream doing the waiting. This could be the first mention of the
1427 stream, so create it if necessary. */
1428 waiting_stream
1429 = select_stream_for_async (async, pthread_self (), true, NULL);
1431 /* Launches on the null stream already block on other streams in the
1432 context. */
1433 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1434 return;
1436 event_gc (true);
1438 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1440 for (other_stream = nvthd->ptx_dev->active_streams;
1441 other_stream != NULL;
1442 other_stream = other_stream->next)
1444 if (!other_stream->multithreaded
1445 && !pthread_equal (other_stream->host_thread, self))
1446 continue;
1448 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1450 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1452 /* Record an event on the waited-for stream. */
1453 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1455 event_add (PTX_EVT_SYNC, e, NULL, 0);
1457 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1460 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1463 static void *
1464 nvptx_get_current_cuda_device (void)
1466 struct nvptx_thread *nvthd = nvptx_thread ();
1468 if (!nvthd || !nvthd->ptx_dev)
1469 return NULL;
1471 return &nvthd->ptx_dev->dev;
1474 static void *
1475 nvptx_get_current_cuda_context (void)
1477 struct nvptx_thread *nvthd = nvptx_thread ();
1479 if (!nvthd || !nvthd->ptx_dev)
1480 return NULL;
1482 return nvthd->ptx_dev->ctx;
1485 static void *
1486 nvptx_get_cuda_stream (int async)
1488 struct ptx_stream *s;
1489 struct nvptx_thread *nvthd = nvptx_thread ();
1491 if (!nvthd || !nvthd->ptx_dev)
1492 return NULL;
1494 s = select_stream_for_async (async, pthread_self (), false, NULL);
1496 return s ? s->stream : NULL;
1499 static int
1500 nvptx_set_cuda_stream (int async, void *stream)
1502 struct ptx_stream *oldstream;
1503 pthread_t self = pthread_self ();
1504 struct nvptx_thread *nvthd = nvptx_thread ();
1506 if (async < 0)
1507 GOMP_PLUGIN_fatal ("bad async %d", async);
1509 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1511 /* We have a list of active streams and an array mapping async values to
1512 entries of that list. We need to take "ownership" of the passed-in stream,
1513 and add it to our list, removing the previous entry also (if there was one)
1514 in order to prevent resource leaks. Note the potential for surprise
1515 here: maybe we should keep track of passed-in streams and leave it up to
1516 the user to tidy those up, but that doesn't work for stream handles
1517 returned from acc_get_cuda_stream above... */
1519 oldstream = select_stream_for_async (async, self, false, NULL);
1521 if (oldstream)
1523 if (nvthd->ptx_dev->active_streams == oldstream)
1524 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1525 else
1527 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1528 while (s->next != oldstream)
1529 s = s->next;
1530 s->next = s->next->next;
1533 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1535 if (!map_fini (oldstream))
1536 GOMP_PLUGIN_fatal ("error when freeing host memory");
1538 free (oldstream);
1541 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1543 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1545 return 1;
1548 /* Plugin entry points. */
1550 const char *
1551 GOMP_OFFLOAD_get_name (void)
1553 return "nvptx";
1556 unsigned int
1557 GOMP_OFFLOAD_get_caps (void)
1559 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1563 GOMP_OFFLOAD_get_type (void)
1565 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1569 GOMP_OFFLOAD_get_num_devices (void)
1571 return nvptx_get_num_devices ();
1574 bool
1575 GOMP_OFFLOAD_init_device (int n)
1577 struct ptx_device *dev;
1579 pthread_mutex_lock (&ptx_dev_lock);
1581 if (!nvptx_init () || ptx_devices[n] != NULL)
1583 pthread_mutex_unlock (&ptx_dev_lock);
1584 return false;
1587 dev = nvptx_open_device (n);
1588 if (dev)
1590 ptx_devices[n] = dev;
1591 instantiated_devices++;
1594 pthread_mutex_unlock (&ptx_dev_lock);
1596 return dev != NULL;
1599 bool
1600 GOMP_OFFLOAD_fini_device (int n)
1602 pthread_mutex_lock (&ptx_dev_lock);
1604 if (ptx_devices[n] != NULL)
1606 if (!nvptx_attach_host_thread_to_device (n)
1607 || !nvptx_close_device (ptx_devices[n]))
1609 pthread_mutex_unlock (&ptx_dev_lock);
1610 return false;
1612 ptx_devices[n] = NULL;
1613 instantiated_devices--;
1616 pthread_mutex_unlock (&ptx_dev_lock);
1617 return true;
1620 /* Return the libgomp version number we're compatible with. There is
1621 no requirement for cross-version compatibility. */
1623 unsigned
1624 GOMP_OFFLOAD_version (void)
1626 return GOMP_VERSION;
1629 /* Initialize __nvptx_clocktick, if present in MODULE. */
1631 static void
1632 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1634 CUdeviceptr dptr;
1635 CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
1636 if (r == CUDA_ERROR_NOT_FOUND)
1637 return;
1638 if (r != CUDA_SUCCESS)
1639 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1640 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1641 r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
1642 if (r != CUDA_SUCCESS)
1643 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1646 /* Load the (partial) program described by TARGET_DATA to device
1647 number ORD. Allocate and return TARGET_TABLE. */
1650 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1651 struct addr_pair **target_table)
1653 CUmodule module;
1654 const char *const *var_names;
1655 const struct targ_fn_launch *fn_descs;
1656 unsigned int fn_entries, var_entries, i, j;
1657 struct targ_fn_descriptor *targ_fns;
1658 struct addr_pair *targ_tbl;
1659 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1660 struct ptx_image_data *new_image;
1661 struct ptx_device *dev;
1663 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1665 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1666 " (expected %u, received %u)",
1667 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1668 return -1;
1671 if (!nvptx_attach_host_thread_to_device (ord)
1672 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1673 return -1;
1675 dev = ptx_devices[ord];
1677 /* The mkoffload utility emits a struct of pointers/integers at the
start of each offload image.  The array of kernel names and the
function addresses form a one-to-one correspondence.  */
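  /* In the address table we hand back, each function entry's START points at
     its struct targ_fn_descriptor (END is just START + 1), while each
     variable entry carries the actual device address and size obtained from
     cuModuleGetGlobal.  */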
1681 var_entries = img_header->var_num;
1682 var_names = img_header->var_names;
1683 fn_entries = img_header->fn_num;
1684 fn_descs = img_header->fn_descs;
1686 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1687 * (fn_entries + var_entries));
1688 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1689 * fn_entries);
1691 *target_table = targ_tbl;
1693 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1694 new_image->target_data = target_data;
1695 new_image->module = module;
1696 new_image->fns = targ_fns;
1698 pthread_mutex_lock (&dev->image_lock);
1699 new_image->next = dev->images;
1700 dev->images = new_image;
1701 pthread_mutex_unlock (&dev->image_lock);
1703 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1705 CUfunction function;
1706 int nregs, mthrs;
1708 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1709 fn_descs[i].fn);
1710 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1711 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1712 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1713 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1715 targ_fns->fn = function;
1716 targ_fns->launch = &fn_descs[i];
1717 targ_fns->regs_per_thread = nregs;
1718 targ_fns->max_threads_per_block = mthrs;
1720 targ_tbl->start = (uintptr_t) targ_fns;
1721 targ_tbl->end = targ_tbl->start + 1;
1724 for (j = 0; j < var_entries; j++, targ_tbl++)
1726 CUdeviceptr var;
1727 size_t bytes;
1729 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1730 &var, &bytes, module, var_names[j]);
1732 targ_tbl->start = (uintptr_t) var;
1733 targ_tbl->end = targ_tbl->start + bytes;
1736 nvptx_set_clocktick (module, dev);
1738 return fn_entries + var_entries;
1741 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1742 function descriptors allocated by G_O_load_image. */
1744 bool
1745 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1747 struct ptx_image_data *image, **prev_p;
1748 struct ptx_device *dev = ptx_devices[ord];
1750 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1752 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1753 " (expected %u, received %u)",
1754 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1755 return false;
1758 bool ret = true;
1759 pthread_mutex_lock (&dev->image_lock);
1760 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1761 if (image->target_data == target_data)
1763 *prev_p = image->next;
1764 if (cuModuleUnload (image->module) != CUDA_SUCCESS)
1765 ret = false;
1766 free (image->fns);
1767 free (image);
1768 break;
1770 pthread_mutex_unlock (&dev->image_lock);
1771 return ret;
1774 void *
1775 GOMP_OFFLOAD_alloc (int ord, size_t size)
1777 if (!nvptx_attach_host_thread_to_device (ord))
1778 return NULL;
1779 return nvptx_alloc (size);
1782 bool
1783 GOMP_OFFLOAD_free (int ord, void *ptr)
1785 return (nvptx_attach_host_thread_to_device (ord)
1786 && nvptx_free (ptr));
1789 bool
1790 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1792 return (nvptx_attach_host_thread_to_device (ord)
1793 && nvptx_dev2host (dst, src, n));
1796 bool
1797 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1799 return (nvptx_attach_host_thread_to_device (ord)
1800 && nvptx_host2dev (dst, src, n));
1803 bool
1804 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1806 struct ptx_device *ptx_dev = ptx_devices[ord];
1807 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1808 ptx_dev->null_stream->stream);
1809 return true;
1812 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1814 void
1815 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1816 void **hostaddrs, void **devaddrs,
1817 int async, unsigned *dims, void *targ_mem_desc)
1819 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1822 void
1823 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1825 struct nvptx_thread *nvthd = nvptx_thread ();
1826 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1828 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1829 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1830 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1834 GOMP_OFFLOAD_openacc_async_test (int async)
1836 return nvptx_async_test (async);
1840 GOMP_OFFLOAD_openacc_async_test_all (void)
1842 return nvptx_async_test_all ();
1845 void
1846 GOMP_OFFLOAD_openacc_async_wait (int async)
1848 nvptx_wait (async);
1851 void
1852 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1854 nvptx_wait_async (async1, async2);
1857 void
1858 GOMP_OFFLOAD_openacc_async_wait_all (void)
1860 nvptx_wait_all ();
1863 void
1864 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1866 nvptx_wait_all_async (async);
1869 void
1870 GOMP_OFFLOAD_openacc_async_set_async (int async)
1872 nvptx_set_async (async);
1875 void *
1876 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1878 struct ptx_device *ptx_dev;
1879 struct nvptx_thread *nvthd
1880 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1881 CUcontext thd_ctx;
1883 ptx_dev = ptx_devices[ord];
1885 assert (ptx_dev);
1887 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1889 assert (ptx_dev->ctx);
1891 if (!thd_ctx)
1892 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1894 nvthd->current_stream = ptx_dev->null_stream;
1895 nvthd->ptx_dev = ptx_dev;
1897 return (void *) nvthd;
1900 void
1901 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1903 free (data);
1906 void *
1907 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1909 return nvptx_get_current_cuda_device ();
1912 void *
1913 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1915 return nvptx_get_current_cuda_context ();
1918 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
1920 void *
1921 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1923 return nvptx_get_cuda_stream (async);
1926 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
1929 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1931 return nvptx_set_cuda_stream (async, stream);
1934 /* Adjust launch dimensions: pick good values for number of blocks and warps
and ensure that the number of warps does not exceed CUDA limits or GCC's
own limits.  */
1938 static void
1939 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1940 struct ptx_device *ptx_dev,
1941 int *teams_p, int *threads_p)
1943 int max_warps_block = fn->max_threads_per_block / 32;
1944 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
and libgcc, which matches the documented limit of all GPUs as of 2015.  */
1946 if (max_warps_block > 32)
1947 max_warps_block = 32;
1948 if (*threads_p <= 0)
1949 *threads_p = 8;
1950 if (*threads_p > max_warps_block)
1951 *threads_p = max_warps_block;
1953 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1954 /* This is an estimate of how many blocks the device can host simultaneously.
The actual limit, which may be lower, can be queried via the "occupancy
control" driver interface (available since CUDA 6.0).  */
1957 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1958 if (*teams_p <= 0 || *teams_p > max_blocks)
1959 *teams_p = max_blocks;
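/* Worked example with hypothetical figures: a device reporting 1024 threads
   per block gives a cap of 32 warps; a kernel using 32 registers per thread
   at the default of 8 warps needs 32 * 32 * 8 = 8192 registers per block, so
   on a device with 65536 registers per SM and 16 SMs the estimate is
   65536 / 8192 * 16 = 128 resident blocks.  */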
1962 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1963 target regions. */
1965 static size_t
1966 nvptx_stacks_size ()
1968 return 128 * 1024;
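/* GOMP_OFFLOAD_run below allocates one such stack per launched warp, i.e.
   teams * threads * 128 KiB of device memory in total.  */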
1971 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1973 static void *
1974 nvptx_stacks_alloc (size_t size, int num)
1976 CUdeviceptr stacks;
1977 CUresult r = cuMemAlloc (&stacks, size * num);
1978 if (r != CUDA_SUCCESS)
1979 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1980 return (void *) stacks;
1983 /* Release storage previously allocated by nvptx_stacks_alloc. */
1985 static void
1986 nvptx_stacks_free (void *p, int num)
1988 CUresult r = cuMemFree ((CUdeviceptr) p);
1989 if (r != CUDA_SUCCESS)
1990 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1993 void
1994 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1996 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1997 CUresult r;
1998 struct ptx_device *ptx_dev = ptx_devices[ord];
1999 const char *maybe_abort_msg = "(perhaps abort was called)";
2000 int teams = 0, threads = 0;
2002 if (!args)
2003 GOMP_PLUGIN_fatal ("No target arguments provided");
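  /* Walk the NULL-terminated argument list.  Each identifier word either
     carries its value inline (shifted by GOMP_TARGET_ARG_VALUE_SHIFT) or
     flags that the value occupies the following element; only the
     device-agnostic NUM_TEAMS and THREAD_LIMIT arguments are honoured
     here.  */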
2004 while (*args)
2006 intptr_t id = (intptr_t) *args++, val;
2007 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2008 val = (intptr_t) *args++;
2009 else
2010 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2011 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2012 continue;
2013 val = val > INT_MAX ? INT_MAX : val;
2014 id &= GOMP_TARGET_ARG_ID_MASK;
2015 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2016 teams = val;
2017 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2018 threads = val;
2020 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2022 size_t stack_size = nvptx_stacks_size ();
2023 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2024 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2025 size_t fn_args_size = sizeof fn_args;
2026 void *config[] = {
2027 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2028 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2029 CU_LAUNCH_PARAM_END
2031 r = cuLaunchKernel (function,
2032 teams, 1, 1,
2033 32, threads, 1,
2034 0, ptx_dev->null_stream->stream, NULL, config);
2035 if (r != CUDA_SUCCESS)
2036 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2038 r = cuCtxSynchronize ();
2039 if (r == CUDA_ERROR_LAUNCH_FAILED)
2040 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2041 maybe_abort_msg);
2042 else if (r != CUDA_SUCCESS)
2043 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2044 nvptx_stacks_free (stacks, teams * threads);
2047 void
2048 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2049 void *async_data)
2051 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");