/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <assert.h>
#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))

static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static char errmsg[128];
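/* Return a short human-readable description of the CUDA result code R,
   falling back to a formatted "unknown result code" message (kept in the
   static buffer above) for codes not found in cuda_errlist.  */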
static char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    {
      if (cuda_errlist[i].r == r)
	return &cuda_errlist[i].m[0];
    }

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

static bool ptx_inited = false;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
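/* Each stream owns one page of page-locked host memory, visible from both
   the host and the device, which the map_* functions below carve up into
   struct map records.  The h_begin/h_end/h_next/h_prev/h_tail pointers in
   struct ptx_stream treat that page as a circular buffer: map_push
   allocates a record at h_next, and map_pop retires the oldest record at
   h_tail.  */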
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
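/* Retire the oldest mapping record on stream S, advancing the tail pointer
   (with wrap-around) past it.  */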
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
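/* Allocate a new mapping record of SIZE bytes (plus the struct map header)
   on stream S for async operation ASYNC, wrapping to the start of the
   buffer if the space remaining at the end is too small.  On return, *H and
   *D point to the record's payload in host and device address space
   respectively.  */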
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;
#define _XSTR(s) _STR(s)
#define _STR(s) #s

static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuInit) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};
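/* Check that the CUDA driver library can be dlopened, and that it provides
   every entry point named in cuda_symnames above.  Return 0 on success,
   -1 otherwise.  */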
static int
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return -1;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
	{
	  dlclose (dh);
	  return -1;
	}
    }

  dlclose (dh);

  return 0;
}
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      cuStreamDestroy (s->stream);
      map_fini (s);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream to
   use as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it
     doesn't (because the null stream enforces overly-strict synchronisation
     with respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed
	 to only wait for asynchronous launches from the same host thread they
	 are invoked on.  If multiple threads use the same async value, we make
	 note of that here and fall back to testing/waiting for all threads in
	 those functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
static int nvptx_get_num_devices (void);

/* Initialize the device.  */

static int
nvptx_init (void)
{
  CUresult r;
  int rc;

  if (ptx_inited)
    return nvptx_get_num_devices ();

  rc = verify_device_library ();
  if (rc < 0)
    return -1;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  ptx_inited = true;

  return nvptx_get_num_devices ();
}
static void
nvptx_fini (void)
{
  ptx_inited = false;
}
static void *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (ptx_dev, async_engines);

  return (void *) ptx_dev;
}
static int
nvptx_close_device (void *targ_data)
{
  CUresult r;
  struct ptx_device *ptx_dev = targ_data;

  if (!ptx_dev)
    return 0;

  fini_streams_for_device (ptx_dev);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);

  return 0;
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (!ptx_inited)
    cuInit (0);

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
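/* JIT-link the PTX fragments for the abort and acc_on_device helpers, the
   GOACC internal routines, and the caller's PTX_CODE into a single image,
   then load it as a CUDA module in *MODULE.  The CU_JIT_TARGET used below
   corresponds to compute capability 3.0.  */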
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
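/* Sweep the global list of outstanding events, destroying and unlinking any
   whose CUDA event has completed and performing the per-event-type cleanup
   (popping kernel argument mappings, unmapping asynchronously-copied
   variables).  MEMMAP_LOCKABLE is false when a caller already holds the
   memory-map lock, in which case PTX_EVT_ASYNC_CLEANUP events are deferred
   to a later sweep.  */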
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
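/* Record a new outstanding event of TYPE on the global list.  E is the CUDA
   event to be queried later; H is the type-specific payload (the stream for
   kernel launches, the host address for memory copies, the target memory
   descriptor for async cleanups).  */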
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
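/* Launch the kernel described by FN (really a targ_fn_descriptor) with
   MAPNUM pointer arguments taken from DEVADDRS, on the stream selected by
   ASYNC.  The argument vector is staged through the stream's
   host/device-mapped page (see map_push) so that a single pointer can be
   handed to cuLaunchKernel.  */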
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs,
	    int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP
     is the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
		     targ_fn->name);

  //  OpenACC	       CUDA
  //
  //  num_gangs	       blocks
  //  num_workers      warps (where a warp is equivalent to 32 threads)
  //  vector length    threads

  /* The OpenACC vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However,
     the CUDA interface abstracts away from that, and only shows us warp size
     indirectly in the maximum number of threads per block, which is a product
     of warp size and the number of hyperthreads of a multiprocessor.

     We choose to map OpenACC vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in GCC code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an OpenACC vector_length of more than the maximum number
     of threads per block will result in a CUDA error.  */
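  /* A worked example (illustrative, not from the original source): with
     vector_length = 128 the launch below uses a 128x1x1 thread block,
     i.e. four 32-thread warps per gang, and the gangs form the
     num_gangs x1x1 grid of blocks.  */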
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void *openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
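/* Copy S bytes from host address H to device address D, after validating
   both addresses.  On a stream other than the null stream the copy is
   issued asynchronously and tracked with a PTX_EVT_MEM event; otherwise it
   is a blocking cuMemcpyHtoD.  */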
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
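/* Copy S bytes from device address D back to host address H; the mirror
   image of nvptx_host2dev above.  */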
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the
	 call to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
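/* Make the stream for ASYNC2 wait for completion of all work currently
   queued on the stream for ASYNC1, without blocking the host: record an
   event on ASYNC1's stream and insert a cuStreamWaitEvent on ASYNC2's.  */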
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, and add it to our list, removing the previous entry also (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: maybe we should keep track of passed-in streams and
     leave it up to the user to tidy those up, but that doesn't work for
     stream handles returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
static void **kernel_target_data;
static void **kernel_host_table;

void
GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
{
  kernel_target_data = target_data;
  kernel_host_table = host_table;
}

void
GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
{
  (void) nvptx_init ();
}

void
GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
{
  nvptx_fini ();
}
int
GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
			struct mapping_table **tablep)
{
  CUmodule module;
  void **fn_table;
  char **fn_names;
  int fn_entries, i;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;

  if (nvptx_init () <= 0)
    return 0;

  /* This isn't an error, because an image may legitimately have no offloaded
     regions and so will not call GOMP_offload_register.  */
  if (kernel_target_data == NULL)
    return 0;

  link_ptx (&module, kernel_target_data[0]);

  /* kernel_target_data[0] -> ptx code
     kernel_target_data[1] -> variable mappings
     kernel_target_data[2] -> array of kernel names in ascii

     kernel_host_table[0] -> start of function addresses (__offload_func_table)
     kernel_host_table[1] -> end of function addresses (__offload_funcs_end)

     The array of kernel names and the function addresses form a
     one-to-one correspondence.  */

  fn_table = kernel_host_table[0];
  fn_names = (char **) kernel_target_data[2];
  fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);

  *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  for (i = 0; i < fn_entries; i++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns[i].fn = function;
      targ_fns[i].name = (const char *) fn_names[i];

      (*tablep)[i].host_start = (uintptr_t) fn_table[i];
      (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
      (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
      (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
    }

  return fn_entries;
}
void *
GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
{
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
{
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
		       const void *src, size_t n)
{
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
		       const void *src, size_t n)
{
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void *
GOMP_OFFLOAD_openacc_open_device (int n)
{
  return nvptx_open_device (n);
}

int
GOMP_OFFLOAD_openacc_close_device (void *h)
{
  return nvptx_close_device (h);
}

void
GOMP_OFFLOAD_openacc_set_device_num (int n)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (n >= 0);

  if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
    (void) nvptx_open_device (n);
}
/* This can be called before the device is "opened" for the current thread, in
   which case we can't tell which device number should be returned.  We don't
   actually want to open the device here, so just return -1 and let the caller
   (oacc-init.c:acc_get_device_num) handle it.  */

int
GOMP_OFFLOAD_openacc_get_device_num (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (nvthd && nvthd->ptx_dev)
    return nvthd->ptx_dev->ord;
  else
    return -1;
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
{
  struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}