/* Plugin for NVPTX execution.

   Copyright (C) 2013-2016 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  /* A page allocated with cuMemAllocHost, mapped on both the host and
     the device and managed as a ring buffer of argument blocks.  */
  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
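
/* The definition of the map record is elided in this extract; the sketch
   below is reconstructed from its uses in map_pop and map_push (m->async,
   m->size, m->mappings), so treat the exact layout as an assumption.  */

struct map
{
  int    async;       /* Async value the block was pushed with.  */
  size_t size;        /* Size of this record, including the payload.  */
  char   mappings[0]; /* Payload handed back to the caller.  */
};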
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h != NULL);
  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      /* Wrap around: absorb the tail of the buffer into the previous
	 record and restart allocation at the beginning of the page.  */
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  /* Write the header of the new record and hand back host/device
     pointers to its payload.  */
  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
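
/* A minimal usage sketch (not part of the original file): pairing map_push
   with map_pop to stage an argument block, under the same assumptions
   nvptx_exec makes below.  The function name is hypothetical.  */

#if 0
static void
example_stage_args (struct ptx_stream *s, void **args, size_t nargs)
{
  void *hp, *dp;

  /* Reserve room for NARGS pointers in the page mapped on both sides;
     HP is the host view, DP the device address of the same bytes.  */
  map_push (s, acc_async_sync, nargs * sizeof (void *), &hp, &dp);

  memcpy (hp, args, nargs * sizeof (void *));

  /* ... queue work that reads DP on the device ...  */

  /* Retire the oldest block once its work has completed.  */
  map_pop (s);
}
#endif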
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};
/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  size_t ptx_num;

  const char *const *var_names;
  size_t var_num;

  const struct targ_fn_launch *fn_descs;
  size_t fn_num;
} nvptx_tdata_t;
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};
/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;

  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  enum ptx_event_type type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
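
/* Illustrative only (not part of the original file): the two ways this
   file calls select_stream_for_async.  The wrapper name is hypothetical.  */

#if 0
static void
example_stream_lookup (int async)
{
  /* Creation mode: make (or find) the stream for ASYNC and associate it
     with this host thread, as nvptx_set_async does.  */
  struct ptx_stream *s
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Lookup mode: return the existing stream or NULL; passing false never
     creates one, as in the test/wait hooks below.  */
  struct ptx_stream *t
    = select_stream_for_async (async, pthread_self (), false, NULL);

  (void) s;
  (void) t;
}
#endif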
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  r = cuLinkCreate (6, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}
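
/* Sweep the global list of pending events and retire those the driver
   reports complete: kernel events pop their argument-block mapping, and
   async-cleanup events unmap their variables.  MEMMAP_LOCKABLE says
   whether we may claim the memory-map lock; when false, cleanup events
   are deferred to a later pass.  */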
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *) te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
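
/* Launch the kernel FN (really a targ_fn_descriptor) with MAPNUM argument
   pointers on the stream selected by ASYNC, staging the argument block
   through the host/device-mapped page and either synchronizing or
   recording a completion event as appropriate.  */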
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);
  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != GOMP_DIM_MAX; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* Fall back to a default size for any dimension left unset.  */
  for (i = 0; i != GOMP_DIM_MAX; i++)
    if (!dims[i])
      dims[i] = /* TODO */ 32;

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);
  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void * openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}
)
1021 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)p
);
1022 if (r
!= CUDA_SUCCESS
)
1023 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1025 if ((CUdeviceptr
)p
!= pb
)
1026 GOMP_PLUGIN_fatal ("invalid device address");
1028 r
= cuMemFree ((CUdeviceptr
)p
);
1029 if (r
!= CUDA_SUCCESS
)
1030 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}
static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}
static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */
  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
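
/* Illustrative only (not part of the original file): user code reaches
   the ownership protocol above through the OpenACC runtime API, along
   these lines:

     CUstream st;
     cuStreamCreate (&st, CU_STREAM_DEFAULT);
     acc_set_cuda_stream (5, st);            // the plugin takes ownership
     void *same = acc_get_cuda_stream (5);  */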
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}
void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
		       " (expected %u, received %u)",
		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}