/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
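
/* Table mapping CUresult codes to human-readable error messages; searched
   linearly by cuda_error below.  */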
static const struct
{
  CUresult r;
  const char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
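
/* Return the error message for CUDA result code R, falling back to a
   static buffer for codes not in the table above (note: that fallback
   is not thread-safe).  */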
static const char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    if (cuda_errlist[i].r == r)
      return cuda_errlist[i].m;

  static char errmsg[30];

  snprintf (errmsg, sizeof (errmsg), "unknown error code: %d", r);

  return errmsg;
}
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
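
/* A CUDA stream together with one page of page-locked host memory, mapped
   at D on the device and H on the host, which is used as a ring buffer of
   argument-mapping chunks for kernel launches on this stream; see map_init,
   map_push and map_pop below.  */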
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
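
/* Header of one chunk in a stream's ring buffer: the async value the chunk
   was queued under, its total size (header included), and the mapping data
   itself as a trailing zero-length array.  */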
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
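
/* Allocate one page of page-locked host memory for stream S, obtain the
   corresponding device pointer, and initialize the ring-buffer cursors.  */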
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
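
/* Reserve a chunk of SIZE bytes (plus header) in stream S's ring buffer for
   async queue ASYNC, wrapping around to the start of the page if the space
   remaining at the end is too small.  Returns the host and device pointers
   of the reserved chunk's payload in *H and *D.  */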
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *) &m->mappings[0] - s->h;

  *d = (void *) (s->d + offset);
  *h = (void *) (s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
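
/* Set up the initial set of streams for PTX_DEV: the null stream plus an
   (initially empty) array with room for CONCURRENCY async streams.  */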
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream
   to use.  */
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
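
/* Tear down PTX_DEV: destroy its streams and, unless the context was
   inherited from elsewhere, the CUDA context itself.  */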
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
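
/* JIT-link PTX_CODE together with the plugin's built-in PTX helper routines
   (ABORT_PTX, ACC_ON_DEVICE_PTX, GOACC_INTERNAL_PTX) and load the result as
   MODULE.  */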
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const
     qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *) te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int num_gangs,
            int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
                     targ_fn->name);

  // OpenACC        CUDA
  //
  // num_gangs      blocks
  // num_workers    warps (where a warp is equivalent to 32 threads)
  // vector length  threads

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,
                      nthreads_in_block, 1, 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void * openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
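
/* Make the stream for ASYNC2 wait (on the device, without blocking the host)
   for all work currently queued on the stream for ASYNC1, by recording an
   event on the latter and inserting a wait for it on the former.  */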
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const char *const *fn_names;
  size_t fn_num;
} nvptx_tdata_t;
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *fn_names, *const *var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_names = img_header->fn_names;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->name = (const char *) fn_names[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
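
/* Presumably a placeholder for an OpenMP-style target-run hook; this
   OpenACC-only plugin never assigns or calls it.  */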
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs, size_t *sizes,
                               unsigned short *kinds, int num_gangs,
                               int num_workers, int vector_length, int async,
                               void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
              num_workers, vector_length, async, targ_mem_desc);
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}