1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
50 cuda_error (CUresult r
)
52 #if CUDA_VERSION < 7000
53 /* Specified in documentation and present in library from at least
54 5.5. Not declared in header file prior to 7.0. */
55 extern CUresult
cuGetErrorString (CUresult
, const char **);
59 r
= cuGetErrorString (r
, &desc
);
60 if (r
!= CUDA_SUCCESS
)
61 desc
= "unknown cuda error";
66 /* Convenience macros for the frequently used CUDA library call and
67 error handling sequence. This does not capture all the cases we
68 use in this file, but is common enough. */
70 #define CUDA_CALL_ERET(ERET, FN, ...) \
72 unsigned __r = FN (__VA_ARGS__); \
73 if (__r != CUDA_SUCCESS) \
75 GOMP_PLUGIN_error (#FN " error: %s", \
81 #define CUDA_CALL(FN, ...) \
82 CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
84 #define CUDA_CALL_ASSERT(FN, ...) \
86 unsigned __r = FN (__VA_ARGS__); \
87 if (__r != CUDA_SUCCESS) \
89 GOMP_PLUGIN_fatal (#FN " error: %s", \
94 static unsigned int instantiated_devices
= 0;
95 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
100 pthread_t host_thread
;
111 struct ptx_stream
*next
;
114 /* Thread-specific data for PTX. */
118 struct ptx_stream
*current_stream
;
119 struct ptx_device
*ptx_dev
;
130 map_init (struct ptx_stream
*s
)
132 int size
= getpagesize ();
138 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
139 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
144 s
->h_end
= s
->h_begin
+ size
;
145 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
153 map_fini (struct ptx_stream
*s
)
155 CUDA_CALL (cuMemFreeHost
, s
->h
);
160 map_pop (struct ptx_stream
*s
)
171 s
->h_tail
+= m
->size
;
173 if (s
->h_tail
>= s
->h_end
)
174 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
176 if (s
->h_next
== s
->h_tail
)
177 s
->h_prev
= s
->h_next
;
179 assert (s
->h_next
>= s
->h_begin
);
180 assert (s
->h_tail
>= s
->h_begin
);
181 assert (s
->h_prev
>= s
->h_begin
);
183 assert (s
->h_next
<= s
->h_end
);
184 assert (s
->h_tail
<= s
->h_end
);
185 assert (s
->h_prev
<= s
->h_end
);
189 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
197 left
= s
->h_end
- s
->h_next
;
198 size
+= sizeof (struct map
);
207 s
->h_next
= s
->h_begin
;
209 if (s
->h_next
+ size
> s
->h_end
)
210 GOMP_PLUGIN_fatal ("unable to push map");
219 offset
= (void *)&m
->mappings
[0] - s
->h
;
221 *d
= (void *)(s
->d
+ offset
);
222 *h
= (void *)(s
->h
+ offset
);
224 s
->h_prev
= s
->h_next
;
230 assert (s
->h_next
>= s
->h_begin
);
231 assert (s
->h_tail
>= s
->h_begin
);
232 assert (s
->h_prev
>= s
->h_begin
);
233 assert (s
->h_next
<= s
->h_end
);
234 assert (s
->h_tail
<= s
->h_end
);
235 assert (s
->h_prev
<= s
->h_end
);
240 /* Target data function launch information. */
242 struct targ_fn_launch
245 unsigned short dim
[GOMP_DIM_MAX
];
248 /* Target PTX object information. */
256 /* Target data image information. */
258 typedef struct nvptx_tdata
260 const struct targ_ptx_obj
*ptx_objs
;
263 const char *const *var_names
;
266 const struct targ_fn_launch
*fn_descs
;
270 /* Descriptor of a loaded function. */
272 struct targ_fn_descriptor
275 const struct targ_fn_launch
*launch
;
278 /* A loaded PTX image. */
279 struct ptx_image_data
281 const void *target_data
;
284 struct targ_fn_descriptor
*fns
; /* Array of functions. */
286 struct ptx_image_data
*next
;
294 struct ptx_stream
*null_stream
;
295 /* All non-null streams associated with this device (actually context),
296 either created implicitly or passed in from the user (via
297 acc_set_cuda_stream). */
298 struct ptx_stream
*active_streams
;
300 struct ptx_stream
**arr
;
303 /* A lock for use when manipulating the above stream list and array. */
304 pthread_mutex_t stream_lock
;
312 struct ptx_image_data
*images
; /* Images loaded on device. */
313 pthread_mutex_t image_lock
; /* Lock for above list. */
315 struct ptx_device
*next
;
323 PTX_EVT_ASYNC_CLEANUP
334 struct ptx_event
*next
;
337 static pthread_mutex_t ptx_event_lock
;
338 static struct ptx_event
*ptx_events
;
340 static struct ptx_device
**ptx_devices
;
342 static inline struct nvptx_thread
*
345 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
349 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
352 struct ptx_stream
*null_stream
353 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
355 null_stream
->stream
= NULL
;
356 null_stream
->host_thread
= pthread_self ();
357 null_stream
->multithreaded
= true;
358 null_stream
->d
= (CUdeviceptr
) NULL
;
359 null_stream
->h
= NULL
;
360 if (!map_init (null_stream
))
363 ptx_dev
->null_stream
= null_stream
;
364 ptx_dev
->active_streams
= NULL
;
365 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
370 /* This is just a guess -- make space for as many async streams as the
371 current device is capable of concurrently executing. This can grow
372 later as necessary. No streams are created yet. */
373 ptx_dev
->async_streams
.arr
374 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
375 ptx_dev
->async_streams
.size
= concurrency
;
377 for (i
= 0; i
< concurrency
; i
++)
378 ptx_dev
->async_streams
.arr
[i
] = NULL
;
384 fini_streams_for_device (struct ptx_device
*ptx_dev
)
386 free (ptx_dev
->async_streams
.arr
);
389 while (ptx_dev
->active_streams
!= NULL
)
391 struct ptx_stream
*s
= ptx_dev
->active_streams
;
392 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
396 CUresult r
= cuStreamDestroy (s
->stream
);
397 if (r
!= CUDA_SUCCESS
)
399 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
405 ret
&= map_fini (ptx_dev
->null_stream
);
406 free (ptx_dev
->null_stream
);
410 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
411 thread THREAD (and also current device/context). If CREATE is true, create
412 the stream if it does not exist (or use EXISTING if it is non-NULL), and
413 associate the stream with the same thread argument. Returns stream to use
416 static struct ptx_stream
*
417 select_stream_for_async (int async
, pthread_t thread
, bool create
,
420 struct nvptx_thread
*nvthd
= nvptx_thread ();
421 /* Local copy of TLS variable. */
422 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
423 struct ptx_stream
*stream
= NULL
;
424 int orig_async
= async
;
426 /* The special value acc_async_noval (-1) maps (for now) to an
427 implicitly-created stream, which is then handled the same as any other
428 numbered async stream. Other options are available, e.g. using the null
429 stream for anonymous async operations, or choosing an idle stream from an
430 active set. But, stick with this for now. */
431 if (async
> acc_async_sync
)
435 pthread_mutex_lock (&ptx_dev
->stream_lock
);
437 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
438 null stream, and in fact better performance may be obtainable if it doesn't
439 (because the null stream enforces overly-strict synchronisation with
440 respect to other streams for legacy reasons, and that's probably not
441 needed with OpenACC). Maybe investigate later. */
442 if (async
== acc_async_sync
)
443 stream
= ptx_dev
->null_stream
;
444 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
445 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
446 stream
= ptx_dev
->async_streams
.arr
[async
];
447 else if (async
>= 0 && create
)
449 if (async
>= ptx_dev
->async_streams
.size
)
451 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
453 if (async
>= newsize
)
456 ptx_dev
->async_streams
.arr
457 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
458 newsize
* sizeof (struct ptx_stream
*));
460 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
461 ptx_dev
->async_streams
.arr
[i
] = NULL
;
463 ptx_dev
->async_streams
.size
= newsize
;
466 /* Create a new stream on-demand if there isn't one already, or if we're
467 setting a particular async value to an existing (externally-provided)
469 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
473 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
476 s
->stream
= existing
;
479 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
480 if (r
!= CUDA_SUCCESS
)
482 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
483 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
488 /* If CREATE is true, we're going to be queueing some work on this
489 stream. Associate it with the current host thread. */
490 s
->host_thread
= thread
;
491 s
->multithreaded
= false;
493 s
->d
= (CUdeviceptr
) NULL
;
497 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
498 GOMP_PLUGIN_fatal ("map_init fail");
501 s
->next
= ptx_dev
->active_streams
;
502 ptx_dev
->active_streams
= s
;
503 ptx_dev
->async_streams
.arr
[async
] = s
;
506 stream
= ptx_dev
->async_streams
.arr
[async
];
511 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
512 GOMP_PLUGIN_fatal ("bad async %d", async
);
517 assert (stream
!= NULL
);
519 /* If we're trying to use the same stream from different threads
520 simultaneously, set stream->multithreaded to true. This affects the
521 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
522 only wait for asynchronous launches from the same host thread they are
523 invoked on. If multiple threads use the same async value, we make note
524 of that here and fall back to testing/waiting for all threads in those
526 if (thread
!= stream
->host_thread
)
527 stream
->multithreaded
= true;
529 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
531 else if (stream
&& !stream
->multithreaded
532 && !pthread_equal (stream
->host_thread
, thread
))
533 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
538 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
539 should be locked on entry and remains locked on exit. */
546 if (instantiated_devices
!= 0)
549 CUDA_CALL (cuInit
, 0);
551 pthread_mutex_init (&ptx_event_lock
, NULL
);
553 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
554 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
559 /* Select the N'th PTX device for the current host thread. The device must
560 have been previously opened before calling this function. */
563 nvptx_attach_host_thread_to_device (int n
)
567 struct ptx_device
*ptx_dev
;
570 r
= cuCtxGetDevice (&dev
);
571 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
573 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
577 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
583 ptx_dev
= ptx_devices
[n
];
586 GOMP_PLUGIN_error ("device %d not found", n
);
590 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
592 /* We don't necessarily have a current context (e.g. if it has been
593 destroyed. Pop it if we do though. */
595 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
597 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
602 static struct ptx_device
*
603 nvptx_open_device (int n
)
605 struct ptx_device
*ptx_dev
;
606 CUdevice dev
, ctx_dev
;
608 int async_engines
, pi
;
610 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
612 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
616 ptx_dev
->ctx_shared
= false;
618 r
= cuCtxGetDevice (&ctx_dev
);
619 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
621 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
625 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
627 /* The current host thread has an active context for a different device.
630 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
633 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
636 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
638 ptx_dev
->ctx_shared
= true;
640 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
641 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
642 ptx_dev
->overlap
= pi
;
644 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
645 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
648 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
649 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
650 ptx_dev
->concur
= pi
;
652 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
653 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
656 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
657 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
660 r
= cuDeviceGetAttribute (&async_engines
,
661 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
662 if (r
!= CUDA_SUCCESS
)
665 ptx_dev
->images
= NULL
;
666 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
668 if (!init_streams_for_device (ptx_dev
, async_engines
))
675 nvptx_close_device (struct ptx_device
*ptx_dev
)
680 if (!fini_streams_for_device (ptx_dev
))
683 pthread_mutex_destroy (&ptx_dev
->image_lock
);
685 if (!ptx_dev
->ctx_shared
)
686 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
693 nvptx_get_num_devices (void)
697 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
699 if (sizeof (void *) != 8)
702 /* This function will be called before the plugin has been initialized in
703 order to enumerate available devices, but CUDA API routines can't be used
704 until cuInit has been called. Just call it now (but don't yet do any
705 further initialization). */
706 if (instantiated_devices
== 0)
708 CUresult r
= cuInit (0);
709 /* This is not an error: e.g. we may have CUDA libraries installed but
710 no devices available. */
711 if (r
!= CUDA_SUCCESS
)
715 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
721 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
724 CUjit_option opts
[6];
730 unsigned long logsize
= LOGSIZE
;
731 CUlinkState linkstate
;
734 size_t linkoutsize
__attribute__ ((unused
));
736 opts
[0] = CU_JIT_WALL_TIME
;
737 optvals
[0] = &elapsed
;
739 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
740 optvals
[1] = &ilog
[0];
742 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
743 optvals
[2] = (void *) logsize
;
745 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
746 optvals
[3] = &elog
[0];
748 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
749 optvals
[4] = (void *) logsize
;
751 opts
[5] = CU_JIT_LOG_VERBOSE
;
752 optvals
[5] = (void *) 1;
754 CUDA_CALL (cuLinkCreate
, 6, opts
, optvals
, &linkstate
);
756 for (; num_objs
--; ptx_objs
++)
758 /* cuLinkAddData's 'data' argument erroneously omits the const
760 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
761 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, (char*)ptx_objs
->code
,
762 ptx_objs
->size
, 0, 0, 0, 0);
763 if (r
!= CUDA_SUCCESS
)
765 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
766 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
772 GOMP_PLUGIN_debug (0, "Linking\n");
773 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
775 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
776 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
778 if (r
!= CUDA_SUCCESS
)
780 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
784 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
785 CUDA_CALL (cuLinkDestroy
, linkstate
);
790 event_gc (bool memmap_lockable
)
792 struct ptx_event
*ptx_event
= ptx_events
;
793 struct ptx_event
*async_cleanups
= NULL
;
794 struct nvptx_thread
*nvthd
= nvptx_thread ();
796 pthread_mutex_lock (&ptx_event_lock
);
798 while (ptx_event
!= NULL
)
801 struct ptx_event
*e
= ptx_event
;
803 ptx_event
= ptx_event
->next
;
805 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
808 r
= cuEventQuery (*e
->evt
);
809 if (r
== CUDA_SUCCESS
)
811 bool append_async
= false;
826 case PTX_EVT_ASYNC_CLEANUP
:
828 /* The function gomp_plugin_async_unmap_vars needs to claim the
829 memory-map splay tree lock for the current device, so we
830 can't call it when one of our callers has already claimed
831 the lock. In that case, just delay the GC for this event
833 if (!memmap_lockable
)
841 cuEventDestroy (*te
);
844 /* Unlink 'e' from ptx_events list. */
846 ptx_events
= ptx_events
->next
;
849 struct ptx_event
*e_
= ptx_events
;
850 while (e_
->next
!= e
)
852 e_
->next
= e_
->next
->next
;
857 e
->next
= async_cleanups
;
865 pthread_mutex_unlock (&ptx_event_lock
);
867 /* We have to do these here, after ptx_event_lock is released. */
868 while (async_cleanups
)
870 struct ptx_event
*e
= async_cleanups
;
871 async_cleanups
= async_cleanups
->next
;
873 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
879 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
881 struct ptx_event
*ptx_event
;
882 struct nvptx_thread
*nvthd
= nvptx_thread ();
884 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
885 || type
== PTX_EVT_ASYNC_CLEANUP
);
887 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
888 ptx_event
->type
= type
;
891 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
892 ptx_event
->val
= val
;
894 pthread_mutex_lock (&ptx_event_lock
);
896 ptx_event
->next
= ptx_events
;
897 ptx_events
= ptx_event
;
899 pthread_mutex_unlock (&ptx_event_lock
);
903 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
904 int async
, unsigned *dims
, void *targ_mem_desc
)
906 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
910 struct ptx_stream
*dev_str
;
913 struct nvptx_thread
*nvthd
= nvptx_thread ();
914 const char *maybe_abort_msg
= "(perhaps abort was called)";
916 function
= targ_fn
->fn
;
918 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
919 assert (dev_str
== nvthd
->current_stream
);
921 /* Initialize the launch dimensions. Typically this is constant,
922 provided by the device compiler, but we must permit runtime
925 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
927 if (targ_fn
->launch
->dim
[i
])
928 dims
[i
] = targ_fn
->launch
->dim
[i
];
935 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
937 dims
[i
] = /* TODO */ 32;
940 /* This reserves a chunk of a pre-allocated page of memory mapped on both
941 the host and the device. HP is a host pointer to the new chunk, and DP is
942 the corresponding device pointer. */
943 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
945 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
947 /* Copy the array of arguments to the mapped page. */
948 for (i
= 0; i
< mapnum
; i
++)
949 ((void **) hp
)[i
] = devaddrs
[i
];
951 /* Copy the (device) pointers to arguments to the device (dp and hp might in
952 fact have the same value on a unified-memory system). */
953 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
954 mapnum
* sizeof (void *));
955 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
956 " gangs=%u, workers=%u, vectors=%u\n",
957 __FUNCTION__
, targ_fn
->launch
->fn
,
958 dims
[0], dims
[1], dims
[2]);
962 // num_gangs nctaid.x
963 // num_workers ntid.y
964 // vector length ntid.x
967 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
968 dims
[GOMP_DIM_GANG
], 1, 1,
969 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
970 0, dev_str
->stream
, kargs
, 0);
972 #ifndef DISABLE_ASYNC
973 if (async
< acc_async_noval
)
975 r
= cuStreamSynchronize (dev_str
->stream
);
976 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
977 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
979 else if (r
!= CUDA_SUCCESS
)
980 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
986 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
988 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
989 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
990 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
992 else if (r
!= CUDA_SUCCESS
)
993 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
997 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
999 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1002 r
= cuCtxSynchronize ();
1003 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1004 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1006 else if (r
!= CUDA_SUCCESS
)
1007 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1010 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1011 targ_fn
->launch
->fn
);
1013 #ifndef DISABLE_ASYNC
1014 if (async
< acc_async_noval
)
1019 void * openacc_get_current_cuda_context (void);
1022 nvptx_alloc (size_t s
)
1026 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1031 nvptx_free (void *p
)
1036 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1037 if ((CUdeviceptr
) p
!= pb
)
1039 GOMP_PLUGIN_error ("invalid device address");
1043 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1049 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1053 struct nvptx_thread
*nvthd
= nvptx_thread ();
1059 GOMP_PLUGIN_error ("invalid device address");
1063 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1067 GOMP_PLUGIN_error ("invalid device address");
1072 GOMP_PLUGIN_error ("invalid host address");
1077 GOMP_PLUGIN_error ("invalid host or device address");
1080 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1082 GOMP_PLUGIN_error ("invalid size");
1086 #ifndef DISABLE_ASYNC
1087 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1089 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1090 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1092 CUDA_CALL (cuMemcpyHtoDAsync
,
1093 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1094 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1095 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1099 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1105 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1109 struct nvptx_thread
*nvthd
= nvptx_thread ();
1115 GOMP_PLUGIN_error ("invalid device address");
1119 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1123 GOMP_PLUGIN_error ("invalid device address");
1128 GOMP_PLUGIN_error ("invalid host address");
1133 GOMP_PLUGIN_error ("invalid host or device address");
1136 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1138 GOMP_PLUGIN_error ("invalid size");
1142 #ifndef DISABLE_ASYNC
1143 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1145 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1146 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1148 CUDA_CALL (cuMemcpyDtoHAsync
,
1149 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1150 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1151 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1155 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1161 nvptx_set_async (int async
)
1163 struct nvptx_thread
*nvthd
= nvptx_thread ();
1164 nvthd
->current_stream
1165 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1169 nvptx_async_test (int async
)
1172 struct ptx_stream
*s
;
1174 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1177 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1179 r
= cuStreamQuery (s
->stream
);
1180 if (r
== CUDA_SUCCESS
)
1182 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1183 whether all work has completed on this stream, and if so omits the call
1184 to the wait hook. If that happens, event_gc might not get called
1185 (which prevents variables from getting unmapped and their associated
1186 device storage freed), so call it here. */
1190 else if (r
== CUDA_ERROR_NOT_READY
)
1193 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1199 nvptx_async_test_all (void)
1201 struct ptx_stream
*s
;
1202 pthread_t self
= pthread_self ();
1203 struct nvptx_thread
*nvthd
= nvptx_thread ();
1205 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1207 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1209 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1210 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1212 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1217 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1225 nvptx_wait (int async
)
1227 struct ptx_stream
*s
;
1229 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1231 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1233 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1239 nvptx_wait_async (int async1
, int async2
)
1242 struct ptx_stream
*s1
, *s2
;
1243 pthread_t self
= pthread_self ();
1245 /* The stream that is waiting (rather than being waited for) doesn't
1246 necessarily have to exist already. */
1247 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1249 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1251 GOMP_PLUGIN_fatal ("invalid async 1\n");
1254 GOMP_PLUGIN_fatal ("identical parameters");
1256 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1258 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1262 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1264 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1266 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
1270 nvptx_wait_all (void)
1273 struct ptx_stream
*s
;
1274 pthread_t self
= pthread_self ();
1275 struct nvptx_thread
*nvthd
= nvptx_thread ();
1277 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1279 /* Wait for active streams initiated by this thread (or by multiple threads)
1281 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1283 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1285 r
= cuStreamQuery (s
->stream
);
1286 if (r
== CUDA_SUCCESS
)
1288 else if (r
!= CUDA_ERROR_NOT_READY
)
1289 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1291 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1295 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1301 nvptx_wait_all_async (int async
)
1303 struct ptx_stream
*waiting_stream
, *other_stream
;
1305 struct nvptx_thread
*nvthd
= nvptx_thread ();
1306 pthread_t self
= pthread_self ();
1308 /* The stream doing the waiting. This could be the first mention of the
1309 stream, so create it if necessary. */
1311 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1313 /* Launches on the null stream already block on other streams in the
1315 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1320 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1322 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1323 other_stream
!= NULL
;
1324 other_stream
= other_stream
->next
)
1326 if (!other_stream
->multithreaded
1327 && !pthread_equal (other_stream
->host_thread
, self
))
1330 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1332 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1334 /* Record an event on the waited-for stream. */
1335 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1337 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1339 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1342 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1346 nvptx_get_current_cuda_device (void)
1348 struct nvptx_thread
*nvthd
= nvptx_thread ();
1350 if (!nvthd
|| !nvthd
->ptx_dev
)
1353 return &nvthd
->ptx_dev
->dev
;
1357 nvptx_get_current_cuda_context (void)
1359 struct nvptx_thread
*nvthd
= nvptx_thread ();
1361 if (!nvthd
|| !nvthd
->ptx_dev
)
1364 return nvthd
->ptx_dev
->ctx
;
1368 nvptx_get_cuda_stream (int async
)
1370 struct ptx_stream
*s
;
1371 struct nvptx_thread
*nvthd
= nvptx_thread ();
1373 if (!nvthd
|| !nvthd
->ptx_dev
)
1376 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1378 return s
? s
->stream
: NULL
;
1382 nvptx_set_cuda_stream (int async
, void *stream
)
1384 struct ptx_stream
*oldstream
;
1385 pthread_t self
= pthread_self ();
1386 struct nvptx_thread
*nvthd
= nvptx_thread ();
1389 GOMP_PLUGIN_fatal ("bad async %d", async
);
1391 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1393 /* We have a list of active streams and an array mapping async values to
1394 entries of that list. We need to take "ownership" of the passed-in stream,
1395 and add it to our list, removing the previous entry also (if there was one)
1396 in order to prevent resource leaks. Note the potential for surprise
1397 here: maybe we should keep track of passed-in streams and leave it up to
1398 the user to tidy those up, but that doesn't work for stream handles
1399 returned from acc_get_cuda_stream above... */
1401 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1405 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1406 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1409 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1410 while (s
->next
!= oldstream
)
1412 s
->next
= s
->next
->next
;
1415 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1417 if (!map_fini (oldstream
))
1418 GOMP_PLUGIN_fatal ("error when freeing host memory");
1423 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1425 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1430 /* Plugin entry points. */
1433 GOMP_OFFLOAD_get_name (void)
1439 GOMP_OFFLOAD_get_caps (void)
1441 return GOMP_OFFLOAD_CAP_OPENACC_200
;
1445 GOMP_OFFLOAD_get_type (void)
1447 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1451 GOMP_OFFLOAD_get_num_devices (void)
1453 return nvptx_get_num_devices ();
1457 GOMP_OFFLOAD_init_device (int n
)
1459 struct ptx_device
*dev
;
1461 pthread_mutex_lock (&ptx_dev_lock
);
1463 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1465 pthread_mutex_unlock (&ptx_dev_lock
);
1469 dev
= nvptx_open_device (n
);
1472 ptx_devices
[n
] = dev
;
1473 instantiated_devices
++;
1476 pthread_mutex_unlock (&ptx_dev_lock
);
1482 GOMP_OFFLOAD_fini_device (int n
)
1484 pthread_mutex_lock (&ptx_dev_lock
);
1486 if (ptx_devices
[n
] != NULL
)
1488 if (!nvptx_attach_host_thread_to_device (n
)
1489 || !nvptx_close_device (ptx_devices
[n
]))
1491 pthread_mutex_unlock (&ptx_dev_lock
);
1494 ptx_devices
[n
] = NULL
;
1495 instantiated_devices
--;
1498 pthread_mutex_unlock (&ptx_dev_lock
);
1502 /* Return the libgomp version number we're compatible with. There is
1503 no requirement for cross-version compatibility. */
1506 GOMP_OFFLOAD_version (void)
1508 return GOMP_VERSION
;
1511 /* Load the (partial) program described by TARGET_DATA to device
1512 number ORD. Allocate and return TARGET_TABLE. */
1515 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1516 struct addr_pair
**target_table
)
1519 const char *const *var_names
;
1520 const struct targ_fn_launch
*fn_descs
;
1521 unsigned int fn_entries
, var_entries
, i
, j
;
1522 struct targ_fn_descriptor
*targ_fns
;
1523 struct addr_pair
*targ_tbl
;
1524 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1525 struct ptx_image_data
*new_image
;
1526 struct ptx_device
*dev
;
1528 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1530 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1531 " (expected %u, received %u)",
1532 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1536 if (!nvptx_attach_host_thread_to_device (ord
)
1537 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1540 dev
= ptx_devices
[ord
];
1542 /* The mkoffload utility emits a struct of pointers/integers at the
1543 start of each offload image. The array of kernel names and the
1544 functions addresses form a one-to-one correspondence. */
1546 var_entries
= img_header
->var_num
;
1547 var_names
= img_header
->var_names
;
1548 fn_entries
= img_header
->fn_num
;
1549 fn_descs
= img_header
->fn_descs
;
1551 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1552 * (fn_entries
+ var_entries
));
1553 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1556 *target_table
= targ_tbl
;
1558 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1559 new_image
->target_data
= target_data
;
1560 new_image
->module
= module
;
1561 new_image
->fns
= targ_fns
;
1563 pthread_mutex_lock (&dev
->image_lock
);
1564 new_image
->next
= dev
->images
;
1565 dev
->images
= new_image
;
1566 pthread_mutex_unlock (&dev
->image_lock
);
1568 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1570 CUfunction function
;
1572 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1575 targ_fns
->fn
= function
;
1576 targ_fns
->launch
= &fn_descs
[i
];
1578 targ_tbl
->start
= (uintptr_t) targ_fns
;
1579 targ_tbl
->end
= targ_tbl
->start
+ 1;
1582 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1587 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1588 &var
, &bytes
, module
, var_names
[j
]);
1590 targ_tbl
->start
= (uintptr_t) var
;
1591 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1594 return fn_entries
+ var_entries
;
1597 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1598 function descriptors allocated by G_O_load_image. */
1601 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1603 struct ptx_image_data
*image
, **prev_p
;
1604 struct ptx_device
*dev
= ptx_devices
[ord
];
1606 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1608 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1609 " (expected %u, received %u)",
1610 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1615 pthread_mutex_lock (&dev
->image_lock
);
1616 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1617 if (image
->target_data
== target_data
)
1619 *prev_p
= image
->next
;
1620 if (cuModuleUnload (image
->module
) != CUDA_SUCCESS
)
1626 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes on device ORD.  Returns the device pointer, or
   NULL if the host thread could not be attached to the device.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}
/* Free device memory PTR on device ORD.  Returns false if attaching
   the host thread or the free itself fails.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}
/* Copy N bytes from device address SRC on device ORD to host address
   DST.  Returns false on failure.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}
/* Copy N bytes from host address SRC to device address DST on device
   ORD.  Returns false on failure.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}
/* Hook for running an offloaded function; unused by this plugin
   (OpenACC launches go through nvptx_exec instead).  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* Launch an OpenACC offloaded region: delegate directly to nvptx_exec
   with the kernel FN, the MAPNUM host/device address pairs, the ASYNC
   queue, and the launch geometry DIMS.  */

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
1670 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
1672 struct nvptx_thread
*nvthd
= nvptx_thread ();
1673 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1675 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1676 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1677 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Non-blocking test whether async queue ASYNC has completed.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* Non-blocking test whether all async queues have completed.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* Block until async queue ASYNC has completed.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* Make async queue ASYNC2 wait for the completion of queue ASYNC1,
   without blocking the host.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* Block until all async queues have completed.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* Make async queue ASYNC wait for all other queues to complete,
   without blocking the host.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* Set the calling thread's current async queue to ASYNC.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
1723 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1725 struct ptx_device
*ptx_dev
;
1726 struct nvptx_thread
*nvthd
1727 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1730 ptx_dev
= ptx_devices
[ord
];
1734 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1736 assert (ptx_dev
->ctx
);
1739 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1741 nvthd
->current_stream
= ptx_dev
->null_stream
;
1742 nvthd
->ptx_dev
= ptx_dev
;
1744 return (void *) nvthd
;
/* Release the per-thread state DATA allocated by
   G_O_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Return the current thread's CUDA device handle, as an opaque
   pointer for the OpenACC runtime API.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Return the current thread's CUDA context, as an opaque pointer for
   the OpenACC runtime API.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}