/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
static const struct
{
  CUresult r;
  const char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static const char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    if (cuda_errlist[i].r == r)
      return cuda_errlist[i].m;

  static char errmsg[30];

  snprintf (errmsg, sizeof (errmsg), "unknown error code: %d", r);

  return errmsg;
}
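/* Illustrative sketch (not part of the plugin): every CUDA driver call in
   this file follows the same check-and-abort pattern.  A hypothetical
   wrapper macro like the one below captures that pattern; the plugin itself
   spells it out longhand at each call site.  */
#if 0
#define CUDA_CALL(FN, ...)						\
  do									\
    {									\
      CUresult r_ = FN (__VA_ARGS__);					\
      if (r_ != CUDA_SUCCESS)						\
	GOMP_PLUGIN_fatal (#FN " error: %s", cuda_error (r_));		\
    }									\
  while (0)

/* Usage: CUDA_CALL (cuInit, 0);  */
#endif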
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
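/* Illustrative sketch (not part of the plugin): map_push carves a chunk out
   of the page-locked ring buffer attached to a stream, and map_pop retires
   the oldest chunk (in the asynchronous paths, event_gc calls map_pop once
   the associated CUDA event has completed).  A hypothetical caller wanting
   room for N pointers might look like this.  */
#if 0
static void
example_ring_usage (struct ptx_stream *s, int async, size_t n)
{
  void *h, *d;

  /* Reserve space for N pointers, mapped on both host and device.  */
  map_push (s, async, n * sizeof (void *), &h, &d);

  /* ... fill in the chunk via H, launch work that reads it via D ...  */

  /* Synchronous case: retire the chunk immediately.  Asynchronous callers
     instead register a PTX_EVT_KNL event and let event_gc pop it.  */
  map_pop (s);
}
#endif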
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

/* A loaded PTX image.  */

struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
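/* Illustrative sketch (not part of the plugin): how OpenACC async arguments
   land in the stream array, assuming the values defined in openacc.h
   (acc_async_sync == -2, acc_async_noval == -1).  Non-sync values are
   shifted up by one so that acc_async_noval occupies slot 0.  */
#if 0
static void
example_async_mapping (void)
{
  /* acc_async_sync -> ptx_dev->null_stream.  */
  select_stream_for_async (acc_async_sync, pthread_self (), false, NULL);

  /* acc_async_noval -> async_streams.arr[0], created on demand.  */
  select_stream_for_async (acc_async_noval, pthread_self (), true, NULL);

  /* User async value 5 -> async_streams.arr[6], growing the array if
     necessary.  */
  select_stream_for_async (5, pthread_self (), true, NULL);
}
#endif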
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
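/* Illustrative sketch (not part of the plugin): link_ptx accepts any
   NUL-terminated PTX source string.  A hypothetical smoke test might hand
   it a trivial kernel like the one below (the PTX text is an assumption,
   written for the compute_30 target selected above).  */
#if 0
static const char example_ptx[] =
  ".version 3.1\n"
  ".target sm_30\n"
  ".address_size 64\n"
  ".visible .entry example_kernel ()\n"
  "{\n"
  "  ret;\n"
  "}\n";

static void
example_link (void)
{
  CUmodule module;
  link_ptx (&module, example_ptx);
}
#endif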
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
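/* Illustrative sketch (not part of the plugin): the asynchronous paths all
   follow the same event lifecycle -- allocate an event, enqueue work,
   record the event behind it, register it with event_add, and let a later
   event_gc pass reap it once cuEventQuery reports completion.  */
#if 0
static void
example_event_lifecycle (CUstream stream)
{
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  /* ... enqueue asynchronous work on STREAM here ...  */
  cuEventRecord (*e, stream);
  event_add (PTX_EVT_SYNC, e, NULL);

  /* Some later call into the plugin destroys and frees the event.  */
  event_gc (true);
}
#endif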
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
	    int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  // OpenACC		CUDA
  //
  // num_gangs		blocks
  // num_workers	warps (where a warp is equivalent to 32 threads)
  // vector length	threads

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
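/* Illustrative sketch (not part of the plugin): with the mapping described
   above, an OpenACC region with num_gangs(32) vector_length(128) becomes a
   CUDA launch of 32 blocks of 128 threads; num_workers is not used by this
   version of the launch code.  */
#if 0
static void
example_launch (CUfunction function, CUstream stream, void **kargs)
{
  CUresult r = cuLaunchKernel (function,
			       32, 1, 1,	/* num_gangs -> grid X.  */
			       128, 1, 1,	/* vector_length -> block X.  */
			       0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
}
#endif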
void * openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
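/* Illustrative sketch (not part of the plugin): how an application would
   exercise the stream-interoperability code above through the public
   OpenACC API declared in openacc.h.  */
#if 0
static void
example_stream_interop (void)
{
  CUstream user_stream;

  cuStreamCreate (&user_stream, CU_STREAM_DEFAULT);

  /* Route OpenACC async queue 3 onto the user-created stream...  */
  acc_set_cuda_stream (3, user_stream);

  /* ... and read it back; both calls end up in the hooks above.  */
  CUstream s = (CUstream) acc_get_cuda_stream (3);
}
#endif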
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const char *const *fn_names;
  size_t fn_num;
} nvptx_tdata_t;

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *fn_names, *const *var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
		       " (expected %u, received %u)",
		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_names = img_header->fn_names;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->name = (const char *) fn_names[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
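/* Illustrative sketch (not part of the plugin): the returned table holds one
   addr_pair per kernel followed by one per global variable, e.g. for two
   functions and one variable:

     target_table[0] = { &targ_fns[0], &targ_fns[0] + 1 }   first kernel
     target_table[1] = { &targ_fns[1], &targ_fns[1] + 1 }   second kernel
     target_table[2] = { var, var + bytes }                 device global

   Function entries point at targ_fn_descriptor records rather than device
   addresses, which is why their 'end' is just 'start' + 1.  */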
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}