Build a shared host libiberty also for libcc1's benefit.
[official-gcc.git] / libgomp / plugin-nvptx.c
blob33f868ae6c6d38603de1ee7e2cbd563a92b7694d
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2014 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU OpenMP Library (libgomp).
9 Libgomp is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
12 any later version.
14 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 more details.
19 Under Section 7 of GPL version 3, you are granted additional
20 permissions described in the GCC Runtime Library Exception, version
21 3.1, as published by the Free Software Foundation.
23 You should have received a copy of the GNU General Public License and
24 a copy of the GCC Runtime Library Exception along with this program;
25 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
26 <http://www.gnu.org/licenses/>. */
28 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
29 library appears to hold some implicit state, but the documentation
30 is not clear as to what that state might be. Or how one might
31 propagate it from one thread to another. */
33 //#define DEBUG
34 //#define DISABLE_ASYNC
36 #include "openacc.h"
37 #include "config.h"
38 #include "libgomp.h"
39 #include "target.h"
40 #include "libgomp-plugin.h"
42 #include <cuda.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <dlfcn.h>
47 #include <unistd.h>
48 #include <assert.h>
50 #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
52 static struct _errlist
54 CUresult r;
55 char *m;
56 } cuErrorList[] = {
57 { CUDA_ERROR_INVALID_VALUE, "invalid value" },
58 { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
59 { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
60 { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
61 { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
62 { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
63 { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
64 { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
65 { CUDA_ERROR_NO_DEVICE, "no device" },
66 { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
67 { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
68 { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
69 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
70 { CUDA_ERROR_MAP_FAILED, "map error" },
71 { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
72 { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
73 { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
74 { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
75 { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
76 { CUDA_ERROR_NOT_MAPPED, "not mapped" },
77 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
78 { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
79 { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
80 { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
81 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
82 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
83 { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
84 { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
85 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
86 "shared object symbol not found" },
87 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
88 { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
89 { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
90 { CUDA_ERROR_NOT_FOUND, "not found" },
91 { CUDA_ERROR_NOT_READY, "not ready" },
92 { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
93 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
94 { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
95 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
96 "launch incompatibe texturing" },
97 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
98 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
99 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
100 { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
101 { CUDA_ERROR_ASSERT, "assert" },
102 { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
103 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
104 "host memory already registered" },
105 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
106 { CUDA_ERROR_NOT_PERMITTED, "no permitted" },
107 { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
108 { CUDA_ERROR_UNKNOWN, "unknown" }
111 static char errmsg[128];
113 static char *
114 cuErrorMsg (CUresult r)
116 int i;
118 for (i = 0; i < ARRAYSIZE (cuErrorList); i++)
120 if (cuErrorList[i].r == r)
121 return &cuErrorList[i].m[0];
124 sprintf (&errmsg[0], "unknown result code: %5d", r);
126 return &errmsg[0];
/* Descriptor for one device kernel: the resolved CUDA function handle
   plus the name it was looked up by (kept for diagnostics; see the
   GOMP_PLUGIN_notify calls in PTX_exec).  */
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};
135 static bool PTX_inited = false;
/* A CUDA stream together with a page of memory mapped on both host and
   device, managed as a ring buffer by map_init/map_push/map_pop and used
   to stage kernel argument blocks.  */
struct PTX_stream
{
  CUstream stream;
  /* Host thread that last claimed this stream (see
     select_stream_for_async).  */
  pthread_t host_thread;
  /* Set when more than one host thread has used the stream; loosens the
     wrong-thread check in select_stream_for_async.  */
  bool multithreaded;

  /* Device-side and host-side addresses of the mapped page allocated in
     map_init (cuMemAllocHost + cuMemHostGetDevicePointer).  */
  CUdeviceptr d;
  void *h;
  /* Ring-buffer bounds and cursors within the mapped page: h_next is
     where the next chunk is carved, h_prev the most recent chunk, h_tail
     the oldest still-live chunk.  */
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  /* Link in the per-device active_streams list.  */
  struct PTX_stream *next;
};
154 /* Each thread may select a stream (also specific to a device/context). */
155 static __thread struct PTX_stream *current_stream;
/* Header for one chunk carved out of a stream's mapped page by map_push.
   SIZE includes this header; MAPPINGS is a GNU zero-length trailing
   array marking where the caller's data begins.  */
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
/* Allocate one page of host memory that is also mapped into the device
   address space, and initialize stream S's ring-buffer cursors to cover
   it.  Must be called exactly once per stream (asserted via !s->d /
   !s->h).  */
static void
map_init (struct PTX_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  /* Page-locked host allocation, visible to the device.  */
  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r));

  /* Device-side alias of the same page.  */
  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r));

  assert (s->h);

  /* Empty ring: all cursors at the start of the page.  */
  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
193 static void
194 map_fini (struct PTX_stream *s)
196 CUresult r;
198 r = cuMemFreeHost (s->h);
199 if (r != CUDA_SUCCESS)
200 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r));
/* Retire the oldest chunk in stream S's ring buffer, advancing the tail
   cursor past it (with wrap-around back to the start of the page).  */
static void
map_pop (struct PTX_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  /* Advance past the oldest chunk (m->size includes the header).  */
  s->h_tail += m->size;

  /* Wrap the tail back to the beginning of the page if it ran off the
     end.  */
  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  /* Ring now empty: collapse prev onto next.  */
  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
/* Carve a new chunk of SIZE bytes (plus header) out of stream S's ring
   buffer.  On return *H and *D point at the chunk's payload via the
   host-side and device-side addresses of the mapped page respectively.
   ASYNC is recorded in the chunk header.  If the chunk does not fit in
   the space remaining before h_end, the leftover space is folded into
   the previous chunk and allocation restarts from the beginning of the
   page; a chunk bigger than the whole page is fatal.  */
static void
map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      /* Not enough room at the end of the page: absorb the slack into
	 the previous chunk so map_pop still walks chunk-to-chunk, then
	 wrap to the start.  */
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  /* Payload offset within the page; identical on the host and device
     views of the mapping.  */
  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Per-device (really per-context) state.  */
struct PTX_device
{
  CUcontext ctx;
  /* True if CTX was already current when the device was opened (and so
     is not owned, and not destroyed, by this plugin).  */
  bool ctx_shared;
  CUdevice dev;
  /* Stream used for synchronous (acc_async_sync) operations.  */
  struct PTX_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct PTX_stream *active_streams;
  /* Async-value-indexed lookup array for the above; grown on demand in
     select_stream_for_async.  */
  struct {
    struct PTX_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  gomp_mutex_t stream_lock;
  /* Device ordinal as passed to PTX_open_device.  */
  int ord;
  /* Device attributes queried at open time (GPU_OVERLAP,
     CAN_MAP_HOST_MEMORY, CONCURRENT_KERNELS, COMPUTE_MODE, INTEGRATED
     respectively -- see PTX_open_device).  */
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  /* Link in the global PTX_devices list.  */
  struct PTX_device *next;
};
310 static __thread struct PTX_device *PTX_dev;
311 static struct PTX_device *PTX_devices;
/* Kinds of asynchronous completion event tracked by event_add/event_gc:
   memory transfers, kernel launches, synchronization markers, and
   deferred unmap-after-async cleanups.  */
enum PTX_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};
/* Bookkeeping record for one pending CUDA event, kept on the global
   PTX_events list until event_gc observes completion.  */
struct PTX_event
{
  /* Heap-allocated CUDA event handle; freed by event_gc.  */
  CUevent *evt;
  /* An enum PTX_event_type value.  */
  int type;
  /* Type-dependent payload: the stream for PTX_EVT_KNL, a host address
     for PTX_EVT_MEM, target memory descriptor for
     PTX_EVT_ASYNC_CLEANUP.  */
  void *addr;
  /* Ordinal of the device the event belongs to.  */
  int ord;

  struct PTX_event *next;
};
331 static gomp_mutex_t PTX_event_lock;
332 static struct PTX_event *PTX_events;
334 #define _XSTR(s) _STR(s)
335 #define _STR(s) #s
/* Names of every CUDA driver API entry point this plugin uses.
   verify_device_library checks each one can be resolved from libcuda.so
   before the plugin reports itself available.  */
static struct _synames
{
  char *n;
} cuSymNames[] =
{
  { _XSTR(cuCtxCreate) },
  { _XSTR(cuCtxDestroy) },
  { _XSTR(cuCtxGetCurrent) },
  { _XSTR(cuCtxPushCurrent) },
  { _XSTR(cuCtxSynchronize) },
  { _XSTR(cuDeviceGet) },
  { _XSTR(cuDeviceGetAttribute) },
  { _XSTR(cuDeviceGetCount) },
  { _XSTR(cuEventCreate) },
  { _XSTR(cuEventDestroy) },
  { _XSTR(cuEventQuery) },
  { _XSTR(cuEventRecord) },
  { _XSTR(cuInit) },
  { _XSTR(cuLaunchKernel) },
  { _XSTR(cuLinkAddData) },
  { _XSTR(cuLinkComplete) },
  { _XSTR(cuLinkCreate) },
  { _XSTR(cuMemAlloc) },
  { _XSTR(cuMemAllocHost) },
  { _XSTR(cuMemcpy) },
  { _XSTR(cuMemcpyDtoH) },
  { _XSTR(cuMemcpyDtoHAsync) },
  { _XSTR(cuMemcpyHtoD) },
  { _XSTR(cuMemcpyHtoDAsync) },
  { _XSTR(cuMemFree) },
  { _XSTR(cuMemFreeHost) },
  { _XSTR(cuMemGetAddressRange) },
  { _XSTR(cuMemHostGetDevicePointer) },
  { _XSTR(cuMemHostRegister) },
  { _XSTR(cuMemHostUnregister) },
  { _XSTR(cuModuleGetFunction) },
  { _XSTR(cuModuleLoadData) },
  { _XSTR(cuStreamDestroy) },
  { _XSTR(cuStreamQuery) },
  { _XSTR(cuStreamSynchronize) },
  { _XSTR(cuStreamWaitEvent) }
};
380 static int
381 verify_device_library (void)
383 int i;
384 void *dh, *ds;
386 dh = dlopen ("libcuda.so", RTLD_LAZY);
387 if (!dh)
388 return -1;
390 for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
392 ds = dlsym (dh, cuSymNames[i].n);
393 if (!ds)
394 return -1;
397 dlclose (dh);
399 return 0;
/* Set up PTX_DEV's stream bookkeeping: create the null (synchronous)
   stream, an empty active-streams list, and a NULL-filled async-stream
   lookup array sized by CONCURRENCY (clamped to at least 1).  */
static void
init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
{
  int i;
  struct PTX_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));

  /* The null stream is the CUDA default stream (handle NULL) and is
     shared by all host threads.  */
  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
/* Tear down everything init_streams_for_device and
   select_stream_for_async created for PTX_DEV: the lookup array, every
   active stream (destroying its CUDA stream and mapped page), and
   finally the null stream.  */
static void
fini_streams_for_device (struct PTX_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct PTX_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      /* NOTE(review): cuStreamDestroy's result is ignored here, unlike
	 other driver calls in this file.  */
      cuStreamDestroy (s->stream);
      map_fini (s);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
453 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
454 thread THREAD (and also current device/context). If CREATE is true, create
455 the stream if it does not exist (or use EXISTING if it is non-NULL), and
456 associate the stream with the same thread argument. Returns stream to use
457 as result. */
static struct PTX_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  /* Local copy of TLS variable.  */
  struct PTX_device *ptx_dev = PTX_dev;
  struct PTX_stream *stream = NULL;
  /* Preserved for diagnostics; ASYNC itself is remapped below.  */
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  /* The stream list/array is only mutated under the lock; read-only
     callers (create == false) go lock-free.  */
  if (create)
    GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      /* Grow the lookup array (doubling, or directly to ASYNC + 1 if
	 that is larger) so index ASYNC is valid.  */
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct PTX_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct PTX_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
	   "for async %d\n", __FILE__, __FUNCTION__, stream,
	   stream ? stream->stream : NULL, orig_async);
#endif

  return stream;
}
574 static int PTX_get_num_devices (void);
576 /* Initialize the device. */
/* Initialize the device.  Idempotent: after the first successful call,
   simply returns the device count.  Returns -1 if libcuda.so or one of
   its entry points is missing.  */
static int
PTX_init (void)
{
  CUresult r;
  int rc;

  if (PTX_inited)
    return PTX_get_num_devices ();

  /* Make sure the driver library is actually usable before calling into
     it.  */
  rc = verify_device_library ();
  if (rc < 0)
    return -1;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));

  PTX_devices = NULL;
  PTX_events = NULL;

  GOMP_PLUGIN_mutex_init (&PTX_event_lock);

  PTX_inited = true;

  return PTX_get_num_devices ();
}
604 static int
605 PTX_fini (void)
607 PTX_inited = false;
609 return 0;
/* Open device ordinal N: if it was opened before, re-select it (pushing
   its context current again); otherwise create/adopt a context, query a
   set of device attributes, and set up its streams.  Returns the
   struct PTX_device, which is also cached in the thread-local PTX_dev.  */
static void *
PTX_open_device (int n)
{
  CUdevice dev;
  CUresult r;
  int async_engines, pi;

  /* Fast path: device already known from a previous open.  */
  if (PTX_devices)
    {
      struct PTX_device *ptx_device;

      for (ptx_device = PTX_devices;
	   ptx_device != NULL;
	   ptx_device = ptx_device->next)
	{
	  if (ptx_device->ord == n)
	    {
	      PTX_dev = ptx_device;

	      if (PTX_dev->ctx)
		{
		  r = cuCtxPushCurrent (PTX_dev->ctx);
		  if (r != CUDA_SUCCESS)
		    GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
				       cuErrorMsg (r));
		}

	      return (void *)PTX_dev;
	    }
	}
    }

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));

  PTX_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
  PTX_dev->ord = n;
  PTX_dev->dev = dev;
  PTX_dev->ctx_shared = false;

  PTX_dev->next = PTX_devices;
  PTX_devices = PTX_dev;

  /* Reuse an already-current context if one exists (e.g. created by the
     application via the CUDA runtime); otherwise create our own.  */
  r = cuCtxGetCurrent (&PTX_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));

  if (!PTX_dev->ctx)
    {
      r = cuCtxCreate (&PTX_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
    }
  else
    {
      PTX_dev->ctx_shared = true;
    }

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->mode = pi;

  /* NOTE(review): MKERN is filled from CU_DEVICE_ATTRIBUTE_INTEGRATED,
     which looks inconsistent with the field's name -- confirm the
     intended attribute.  */
  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->mkern = pi;

  /* If the async-engine count cannot be queried, assume one engine.  */
  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (PTX_dev, async_engines);

  current_stream = PTX_dev->null_stream;

  return (void *)PTX_dev;
}
/* Close the current device (H is unused): destroy its streams, destroy
   the context if this plugin created it (not if it was adopted/shared),
   unlink the device from the global list and free it.  */
static int
PTX_close_device (void *h __attribute__((unused)))
{
  CUresult r;

  if (!PTX_dev)
    return 0;

  fini_streams_for_device (PTX_dev);

  /* Only destroy contexts we created ourselves.  */
  if (!PTX_dev->ctx_shared)
    {
      r = cuCtxDestroy (PTX_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
    }

  /* Unlink from the singly-linked PTX_devices list.  */
  if (PTX_devices == PTX_dev)
    PTX_devices = PTX_devices->next;
  else
    {
      struct PTX_device* d = PTX_devices;
      while (d->next != PTX_dev)
	d = d->next;
      d->next = d->next->next;
    }

  free (PTX_dev);

  PTX_dev = NULL;

  return 0;
}
746 static int
747 PTX_get_num_devices (void)
749 int n;
750 CUresult r;
752 assert (PTX_inited);
754 r = cuDeviceGetCount (&n);
755 if (r!= CUDA_SUCCESS)
756 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
758 return n;
761 static bool
762 PTX_avail(void)
764 bool avail = false;
766 if (PTX_init () > 0)
767 avail = true;
769 return avail;
772 #define ABORT_PTX \
773 ".version 3.1\n" \
774 ".target sm_30\n" \
775 ".address_size 64\n" \
776 ".visible .func abort;\n" \
777 ".visible .func abort\n" \
778 "{\n" \
779 "trap;\n" \
780 "ret;\n" \
781 "}\n" \
782 ".visible .func _gfortran_abort;\n" \
783 ".visible .func _gfortran_abort\n" \
784 "{\n" \
785 "trap;\n" \
786 "ret;\n" \
787 "}\n" \
789 /* Generated with:
791 $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
793 #define ACC_ON_DEVICE_PTX \
794 " .version 3.1\n" \
795 " .target sm_30\n" \
796 " .address_size 64\n" \
797 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
798 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
799 "{\n" \
800 " .reg.u32 %ar1;\n" \
801 ".reg.u32 %retval;\n" \
802 " .reg.u64 %hr10;\n" \
803 " .reg.u32 %r24;\n" \
804 " .reg.u32 %r25;\n" \
805 " .reg.pred %r27;\n" \
806 " .reg.u32 %r30;\n" \
807 " ld.param.u32 %ar1, [%in_ar1];\n" \
808 " mov.u32 %r24, %ar1;\n" \
809 " setp.ne.u32 %r27,%r24,4;\n" \
810 " set.u32.eq.u32 %r30,%r24,5;\n" \
811 " neg.s32 %r25, %r30;\n" \
812 " @%r27 bra $L3;\n" \
813 " mov.u32 %r25, 1;\n" \
814 "$L3:\n" \
815 " mov.u32 %retval, %r25;\n" \
816 " st.param.u32 [%out_retval], %retval;\n" \
817 " ret;\n" \
818 " }\n" \
819 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
820 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
821 "{\n" \
822 " .reg.u64 %ar1;\n" \
823 ".reg.u32 %retval;\n" \
824 " .reg.u64 %hr10;\n" \
825 " .reg.u64 %r25;\n" \
826 " .reg.u32 %r26;\n" \
827 " .reg.u32 %r27;\n" \
828 " ld.param.u64 %ar1, [%in_ar1];\n" \
829 " mov.u64 %r25, %ar1;\n" \
830 " ld.u32 %r26, [%r25];\n" \
831 " {\n" \
832 " .param.u32 %retval_in;\n" \
833 " {\n" \
834 " .param.u32 %out_arg0;\n" \
835 " st.param.u32 [%out_arg0], %r26;\n" \
836 " call (%retval_in), acc_on_device, (%out_arg0);\n" \
837 " }\n" \
838 " ld.param.u32 %r27, [%retval_in];\n" \
839 "}\n" \
840 " mov.u32 %retval, %r27;\n" \
841 " st.param.u32 [%out_retval], %retval;\n" \
842 " ret;\n" \
843 " }"
/* JIT-link PTX_CODE together with the built-in abort and acc_on_device
   PTX stubs, then load the linked image as *MODULE.  Fatal on any
   driver error; the JIT's error log is printed first when linking
   fails.  */
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__((unused));

  GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);

  /* JIT options: collect wall time, info and error logs, be verbose,
     and target compute capability 3.0.  The *_SIZE_BYTES options pass
     the size as the option value itself, per the driver API.  */
  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));

  /* Built-in abort stub (and its Fortran alias).  */
  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
    }

  /* Built-in acc_on_device implementation.  */
  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuErrorMsg (r));
    }

  /* The caller-supplied program itself.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r));

  GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r));
}
926 static void
927 event_gc (bool memmap_lockable)
929 struct PTX_event *ptx_event = PTX_events;
931 GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
933 while (ptx_event != NULL)
935 CUresult r;
936 struct PTX_event *e = ptx_event;
938 ptx_event = ptx_event->next;
940 if (e->ord != PTX_dev->ord)
941 continue;
943 r = cuEventQuery (*e->evt);
944 if (r == CUDA_SUCCESS)
946 CUevent *te;
948 te = e->evt;
950 switch (e->type)
952 case PTX_EVT_MEM:
953 case PTX_EVT_SYNC:
954 break;
956 case PTX_EVT_KNL:
957 map_pop (e->addr);
958 break;
960 case PTX_EVT_ASYNC_CLEANUP:
962 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
963 memory-map splay tree lock for the current device, so we
964 can't call it when one of our callers has already claimed
965 the lock. In that case, just delay the GC for this event
966 until later. */
967 if (!memmap_lockable)
968 continue;
970 GOMP_PLUGIN_async_unmap_vars (e->addr);
972 break;
975 cuEventDestroy (*te);
976 free ((void *)te);
978 if (PTX_events == e)
979 PTX_events = PTX_events->next;
980 else
982 struct PTX_event *e_ = PTX_events;
983 while (e_->next != e)
984 e_ = e_->next;
985 e_->next = e_->next->next;
988 free (e);
992 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
995 static void
996 event_add (enum PTX_event_type type, CUevent *e, void *h)
998 struct PTX_event *ptx_event;
1000 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1001 || type == PTX_EVT_ASYNC_CLEANUP);
1003 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
1004 ptx_event->type = type;
1005 ptx_event->evt = e;
1006 ptx_event->addr = h;
1007 ptx_event->ord = PTX_dev->ord;
1009 GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
1011 ptx_event->next = PTX_events;
1012 PTX_events = ptx_event;
1014 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
/* Launch the kernel described by FN (really a struct targ_fn_descriptor)
   with MAPNUM device pointers DEVADDRS as its single argument block, on
   the stream selected by ASYNC.  HOSTADDRS, SIZES, KINDS, NUM_GANGS,
   NUM_WORKERS and TARG_MEM_DESC are currently unused here (mapping was
   done by the caller); VECTOR_LENGTH becomes the CUDA block size.
   Synchronous launches (async < acc_async_noval) wait for completion and
   pop the argument chunk immediately; asynchronous ones record a CUDA
   event so event_gc can reclaim the chunk later.  */
void
PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
	  int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct PTX_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r));

  GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  // XXX: possible geometry mappings??
  //
  //      OpenACC        CUDA
  //
  //      num_gangs      blocks
  //      num_workers    warps (where a warp is equivalent to 32 threads)
  //      vector length  threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      1, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      /* Synchronous launch: wait for this stream to drain.  */
      r = cuStreamSynchronize (dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
    }
  else
    {
      /* Asynchronous launch: record an event after the kernel so that
	 event_gc can pop the argument chunk once it completes.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r));
#endif

  GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__,
		      targ_fn->name);

  /* For synchronous launches the argument chunk can be retired now; the
     asynchronous path leaves it for event_gc.  */
#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
1128 void * openacc_get_current_cuda_context (void);
1130 static void *
1131 PTX_alloc (size_t s)
1133 CUdeviceptr d;
1134 CUresult r;
1136 r = cuMemAlloc (&d, s);
1137 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1138 return 0;
1139 if (r != CUDA_SUCCESS)
1140 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r));
1141 return (void *)d;
1144 static void
1145 PTX_free (void *p)
1147 CUresult r;
1148 CUdeviceptr pb;
1149 size_t ps;
1151 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1152 if (r != CUDA_SUCCESS)
1153 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
1155 if ((CUdeviceptr)p != pb)
1156 GOMP_PLUGIN_fatal ("invalid device address");
1158 r = cuMemFree ((CUdeviceptr)p);
1159 if (r != CUDA_SUCCESS)
1160 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r));
/* Copy S bytes from host address H to device address D, after validating
   that D lies within a live device allocation.  When the current stream
   is a non-null (async) stream the copy is enqueued asynchronously and a
   completion event is recorded; otherwise the copy is synchronous.
   Always returns 0.  */
static void *
PTX_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  /* Look up the allocation containing D so the copy can be
     bounds-checked below.  */
  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (current_stream != PTX_dev->null_stream)
    {
      /* Asynchronous path: enqueue the copy and record an event so
	 event_gc can observe completion.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));

      /* Reclaim any already-completed events first (without touching the
	 memory map -- our caller may hold its lock).  */
      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));

      r = cuEventRecord (*e, current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r));
    }

  return 0;
}
1226 static void *
1227 PTX_dev2host (void *h, const void *d, size_t s)
1229 CUresult r;
1230 CUdeviceptr pb;
1231 size_t ps;
1233 if (!s)
1234 return 0;
1236 if (!d)
1237 GOMP_PLUGIN_fatal ("invalid device address");
1239 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1240 if (r != CUDA_SUCCESS)
1241 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
1243 if (!pb)
1244 GOMP_PLUGIN_fatal ("invalid device address");
1246 if (!h)
1247 GOMP_PLUGIN_fatal ("invalid host address");
1249 if (d == h)
1250 GOMP_PLUGIN_fatal ("invalid host or device address");
1252 if ((void *)(d + s) > (void *)(pb + ps))
1253 GOMP_PLUGIN_fatal ("invalid size");
1255 #ifndef DISABLE_ASYNC
1256 if (current_stream != PTX_dev->null_stream)
1258 CUevent *e;
1260 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1262 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1263 if (r != CUDA_SUCCESS)
1264 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r));
1266 event_gc (false);
1268 r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, current_stream->stream);
1269 if (r != CUDA_SUCCESS)
1270 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
1272 r = cuEventRecord (*e, current_stream->stream);
1273 if (r != CUDA_SUCCESS)
1274 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1276 event_add (PTX_EVT_MEM, e, (void *)h);
1278 else
1279 #endif
1281 r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1282 if (r != CUDA_SUCCESS)
1283 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r));
1286 return 0;
1289 static void
1290 PTX_set_async (int async)
1292 current_stream = select_stream_for_async (async, pthread_self (), true, NULL);
1295 static int
1296 PTX_async_test (int async)
1298 CUresult r;
1299 struct PTX_stream *s;
1301 s = select_stream_for_async (async, pthread_self (), false, NULL);
1303 if (!s)
1304 GOMP_PLUGIN_fatal ("unknown async %d", async);
1306 r = cuStreamQuery (s->stream);
1307 if (r == CUDA_SUCCESS)
1309 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1310 whether all work has completed on this stream, and if so omits the call
1311 to the wait hook. If that happens, event_gc might not get called
1312 (which prevents variables from getting unmapped and their associated
1313 device storage freed), so call it here. */
1314 event_gc (true);
1315 return 1;
1317 else if (r == CUDA_ERROR_NOT_READY)
1318 return 0;
1320 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
1322 return 0;
1325 static int
1326 PTX_async_test_all (void)
1328 struct PTX_stream *s;
1329 pthread_t self = pthread_self ();
1331 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1333 for (s = PTX_dev->active_streams; s != NULL; s = s->next)
1335 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1336 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1338 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1339 return 0;
1343 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1345 event_gc (true);
1347 return 1;
1350 static void
1351 PTX_wait (int async)
1353 CUresult r;
1354 struct PTX_stream *s;
1356 s = select_stream_for_async (async, pthread_self (), false, NULL);
1358 if (!s)
1359 GOMP_PLUGIN_fatal ("unknown async %d", async);
1361 r = cuStreamSynchronize (s->stream);
1362 if (r != CUDA_SUCCESS)
1363 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
1365 event_gc (true);
1368 static void
1369 PTX_wait_async (int async1, int async2)
1371 CUresult r;
1372 CUevent *e;
1373 struct PTX_stream *s1, *s2;
1374 pthread_t self = pthread_self ();
1376 /* The stream that is waiting (rather than being waited for) doesn't
1377 necessarily have to exist already. */
1378 s2 = select_stream_for_async (async2, self, true, NULL);
1380 s1 = select_stream_for_async (async1, self, false, NULL);
1381 if (!s1)
1382 GOMP_PLUGIN_fatal ("invalid async 1\n");
1384 if (s1 == s2)
1385 GOMP_PLUGIN_fatal ("identical parameters");
1387 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1389 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1390 if (r != CUDA_SUCCESS)
1391 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1393 event_gc (true);
1395 r = cuEventRecord (*e, s1->stream);
1396 if (r != CUDA_SUCCESS)
1397 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1399 event_add (PTX_EVT_SYNC, e, NULL);
1401 r = cuStreamWaitEvent (s2->stream, *e, 0);
1402 if (r != CUDA_SUCCESS)
1403 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
1406 static void
1407 PTX_wait_all (void)
1409 CUresult r;
1410 struct PTX_stream *s;
1411 pthread_t self = pthread_self ();
1413 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1415 /* Wait for active streams initiated by this thread (or by multiple threads)
1416 to complete. */
1417 for (s = PTX_dev->active_streams; s != NULL; s = s->next)
1419 if (s->multithreaded || pthread_equal (s->host_thread, self))
1421 r = cuStreamQuery (s->stream);
1422 if (r == CUDA_SUCCESS)
1423 continue;
1424 else if (r != CUDA_ERROR_NOT_READY)
1425 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
1427 r = cuStreamSynchronize (s->stream);
1428 if (r != CUDA_SUCCESS)
1429 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
1433 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1435 event_gc (true);
1438 static void
1439 PTX_wait_all_async (int async)
1441 CUresult r;
1442 struct PTX_stream *waiting_stream, *other_stream;
1443 CUevent *e;
1444 pthread_t self = pthread_self ();
1446 /* The stream doing the waiting. This could be the first mention of the
1447 stream, so create it if necessary. */
1448 waiting_stream
1449 = select_stream_for_async (async, pthread_self (), true, NULL);
1451 /* Launches on the null stream already block on other streams in the
1452 context. */
1453 if (!waiting_stream || waiting_stream == PTX_dev->null_stream)
1454 return;
1456 event_gc (true);
1458 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1460 for (other_stream = PTX_dev->active_streams;
1461 other_stream != NULL;
1462 other_stream = other_stream->next)
1464 if (!other_stream->multithreaded
1465 && !pthread_equal (other_stream->host_thread, self))
1466 continue;
1468 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1470 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1471 if (r != CUDA_SUCCESS)
1472 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1474 /* Record an event on the waited-for stream. */
1475 r = cuEventRecord (*e, other_stream->stream);
1476 if (r != CUDA_SUCCESS)
1477 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1479 event_add (PTX_EVT_SYNC, e, NULL);
1481 r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1482 if (r != CUDA_SUCCESS)
1483 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
1486 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1489 static void *
1490 PTX_get_current_cuda_device (void)
1492 if (!PTX_dev)
1493 return NULL;
1495 return &PTX_dev->dev;
1498 static void *
1499 PTX_get_current_cuda_context (void)
1501 if (!PTX_dev)
1502 return NULL;
1504 return PTX_dev->ctx;
1507 static void *
1508 PTX_get_cuda_stream (int async)
1510 struct PTX_stream *s;
1512 if (!PTX_dev)
1513 return NULL;
1515 s = select_stream_for_async (async, pthread_self (), false, NULL);
1517 return s ? s->stream : NULL;
1520 static int
1521 PTX_set_cuda_stream (int async, void *stream)
1523 struct PTX_stream *oldstream;
1524 pthread_t self = pthread_self ();
1526 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1528 if (async < 0)
1529 GOMP_PLUGIN_fatal ("bad async %d", async);
1531 /* We have a list of active streams and an array mapping async values to
1532 entries of that list. We need to take "ownership" of the passed-in stream,
1533 and add it to our list, removing the previous entry also (if there was one)
1534 in order to prevent resource leaks. Note the potential for surprise
1535 here: maybe we should keep track of passed-in streams and leave it up to
1536 the user to tidy those up, but that doesn't work for stream handles
1537 returned from acc_get_cuda_stream above... */
1539 oldstream = select_stream_for_async (async, self, false, NULL);
1541 if (oldstream)
1543 if (PTX_dev->active_streams == oldstream)
1544 PTX_dev->active_streams = PTX_dev->active_streams->next;
1545 else
1547 struct PTX_stream *s = PTX_dev->active_streams;
1548 while (s->next != oldstream)
1549 s = s->next;
1550 s->next = s->next->next;
1553 cuStreamDestroy (oldstream->stream);
1554 map_fini (oldstream);
1555 free (oldstream);
1558 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1560 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1562 return 1;
1565 /* Plugin entry points. */
1569 get_type (void)
1571 #ifdef DEBUG
1572 fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
1573 #endif
1575 return TARGET_TYPE_NVIDIA_PTX;
1578 unsigned int
1579 get_caps (void)
1581 return TARGET_CAP_OPENACC_200;
/* Plugin entry point: the short name identifying this plugin.  */

const char *
get_name (void)
{
  return "nvidia";
}
/* Plugin entry point: report how many PTX devices are available.  */

int
get_num_devices (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_num_devices ();
}
/* Recorded by offload_register and consumed by device_get_table:
   the target (PTX) image data and the host-side function table.  */
static void **kernel_target_data;
static void **kernel_host_table;
1603 void
1604 offload_register (void *host_table, void *target_data)
1606 #ifdef DEBUG
1607 fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
1608 host_table, target_data);
1609 #endif
1611 kernel_target_data = target_data;
1612 kernel_host_table = host_table;
/* Plugin entry point: initialize the PTX device layer.  */

int
device_init (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_init ();
}
/* Plugin entry point: tear down the PTX device layer.  */

int
device_fini (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_fini ();
}
1636 device_get_table (struct mapping_table **tablep)
1638 CUmodule module;
1639 void **fn_table;
1640 char **fn_names;
1641 int fn_entries, i;
1642 CUresult r;
1643 struct targ_fn_descriptor *targ_fns;
1645 #ifdef DEBUG
1646 fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
1647 tablep);
1648 #endif
1650 if (PTX_init () <= 0)
1651 return 0;
1653 /* This isn't an error, because an image may legitimately have no offloaded
1654 regions and so will not call GOMP_offload_register. */
1655 if (kernel_target_data == NULL)
1656 return 0;
1658 link_ptx (&module, kernel_target_data[0]);
1660 /* kernel_target_data[0] -> ptx code
1661 kernel_target_data[1] -> variable mappings
1662 kernel_target_data[2] -> array of kernel names in ascii
1664 kernel_host_table[0] -> start of function addresses (_omp_func_table)
1665 kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
1667 The array of kernel names and the functions addresses form a
1668 one-to-one correspondence. */
1670 fn_table = kernel_host_table[0];
1671 fn_names = (char **) kernel_target_data[2];
1672 fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
1674 *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
1675 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1676 * fn_entries);
1678 for (i = 0; i < fn_entries; i++)
1680 CUfunction function;
1682 r = cuModuleGetFunction (&function, module, fn_names[i]);
1683 if (r != CUDA_SUCCESS)
1684 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r));
1686 targ_fns[i].fn = function;
1687 targ_fns[i].name = (const char *) fn_names[i];
1689 (*tablep)[i].host_start = (uintptr_t) fn_table[i];
1690 (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
1691 (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
1692 (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
1695 return fn_entries;
/* Plugin entry point: allocate SIZE bytes of device memory.  */

void *
device_alloc (size_t size)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
	   size);
#endif

  return PTX_alloc (size);
}
/* Plugin entry point: free device memory PTR from device_alloc.  */

void
device_free (void *ptr)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
#endif

  PTX_free (ptr);
}
/* Plugin entry point: copy N bytes from device SRC to host DST.  */

void *
device_dev2host (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_dev2host (dst, src, n);
}
/* Plugin entry point: copy N bytes from host SRC to device DST.  */

void *
device_host2dev (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_host2dev (dst, src, n);
}
/* This plugin provides no target (OpenMP) run hook; only the OpenACC
   entry points below are implemented.  */
void (*device_run) (void *fn_ptr, void *vars) = NULL;
/* Plugin entry point: launch an OpenACC parallel region FN with the
   given mapping arrays and launch geometry, on stream ASYNC.

   Fix: the DEBUG trace forwarded DEVADDRS to PTX_exec but omitted it
   from the fprintf output, so the printed argument list did not match
   the call; include it.  */

void
openacc_parallel (void (*fn) (void *), size_t mapnum, void **hostaddrs,
		  void **devaddrs, size_t *sizes, unsigned short *kinds,
		  int num_gangs, int num_workers, int vector_length,
		  int async, void *targ_mem_desc)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %p, %d, %d, "
	   "%d, %d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs,
	   devaddrs, sizes, kinds, num_gangs, num_workers, vector_length,
	   async, targ_mem_desc);
#endif

  PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	    num_workers, vector_length, async, targ_mem_desc);
}
/* Plugin entry point: open PTX device number N.  */

void *
openacc_open_device (int n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
#endif

  return PTX_open_device (n);
}
/* Plugin entry point: close the device handle H.  */

int
openacc_close_device (void *h)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
#endif

  return PTX_close_device (h);
}
1778 void
1779 openacc_set_device_num (int n)
1781 assert (n >= 0);
1783 if (!PTX_dev || PTX_dev->ord != n)
1784 (void) PTX_open_device (n);
1787 /* This can be called before the device is "opened" for the current thread, in
1788 which case we can't tell which device number should be returned. We don't
1789 actually want to open the device here, so just return -1 and let the caller
1790 (oacc-init.c:acc_get_device_num) handle it. */
1793 openacc_get_device_num (void)
1795 if (PTX_dev)
1796 return PTX_dev->ord;
1797 else
1798 return -1;
/* Plugin entry point: report whether a PTX device is available.  */

bool
openacc_avail (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_avail ();
}
1810 void
1811 openacc_register_async_cleanup (void *targ_mem_desc)
1813 CUevent *e;
1814 CUresult r;
1816 #ifdef DEBUG
1817 fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
1818 targ_mem_desc);
1819 #endif
1821 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1823 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1824 if (r != CUDA_SUCCESS)
1825 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1827 r = cuEventRecord (*e, current_stream->stream);
1828 if (r != CUDA_SUCCESS)
1829 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1831 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
/* Plugin entry point: nonzero if ASYNC's queued work has completed.  */

int
openacc_async_test (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_async_test (async);
}
/* Plugin entry point: nonzero if all async work has completed.  */

int
openacc_async_test_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_async_test_all ();
}
/* Plugin entry point: block until ASYNC's queued work completes.  */

void
openacc_async_wait (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait (async);
}
/* Plugin entry point: make ASYNC2 wait on ASYNC1's queued work.  */

void
openacc_async_wait_async (int async1, int async2)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
	   async1, async2);
#endif

  PTX_wait_async (async1, async2);
}
/* Plugin entry point: block until all async work completes.  */

void
openacc_async_wait_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  PTX_wait_all ();
}
/* Plugin entry point: make ASYNC wait on all other async work.  */

void
openacc_async_wait_all_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait_all_async (async);
}
/* Plugin entry point: make ASYNC the current queue for this thread.  */

void
openacc_async_set_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_set_async (async);
}
/* Plugin entry point: expose the current CUDA device handle.  */

void *
openacc_get_current_cuda_device (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_device ();
}
/* Plugin entry point: expose the current CUDA context.  */

void *
openacc_get_current_cuda_context (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a PTX_stream pointer.  */

void *
openacc_get_cuda_stream (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a PTX_stream pointer.  */

int
openacc_set_cuda_stream (int async, void *stream)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
	   async, stream);
#endif

  return PTX_set_cuda_stream (async, stream);
}