1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2015 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option) any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
50 cuda_error (CUresult r
)
52 #if CUDA_VERSION < 7000
53 /* Specified in documentation and present in library from at least
54 5.5. Not declared in header file prior to 7.0. */
55 extern CUresult
cuGetErrorString (CUresult
, const char **);
59 r
= cuGetErrorString (r
, &desc
);
60 if (r
!= CUDA_SUCCESS
)
61 desc
= "unknown cuda error";
/* Number of devices that have been initialized via GOMP_OFFLOAD_init_device
   and not yet finalized; accessed under PTX_DEV_LOCK.  */
66 static unsigned int instantiated_devices
= 0;
/* Serializes device bring-up/tear-down: guards INSTANTIATED_DEVICES and the
   PTX_DEVICES table.  */
67 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
72 pthread_t host_thread
;
83 struct ptx_stream
*next
;
86 /* Thread-specific data for PTX. */
90 struct ptx_stream
*current_stream
;
91 struct ptx_device
*ptx_dev
;
102 map_init (struct ptx_stream
*s
)
106 int size
= getpagesize ();
112 r
= cuMemAllocHost (&s
->h
, size
);
113 if (r
!= CUDA_SUCCESS
)
114 GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r
));
116 r
= cuMemHostGetDevicePointer (&s
->d
, s
->h
, 0);
117 if (r
!= CUDA_SUCCESS
)
118 GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r
));
123 s
->h_end
= s
->h_begin
+ size
;
124 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
131 map_fini (struct ptx_stream
*s
)
135 r
= cuMemFreeHost (s
->h
);
136 if (r
!= CUDA_SUCCESS
)
137 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r
));
141 map_pop (struct ptx_stream
*s
)
152 s
->h_tail
+= m
->size
;
154 if (s
->h_tail
>= s
->h_end
)
155 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
157 if (s
->h_next
== s
->h_tail
)
158 s
->h_prev
= s
->h_next
;
160 assert (s
->h_next
>= s
->h_begin
);
161 assert (s
->h_tail
>= s
->h_begin
);
162 assert (s
->h_prev
>= s
->h_begin
);
164 assert (s
->h_next
<= s
->h_end
);
165 assert (s
->h_tail
<= s
->h_end
);
166 assert (s
->h_prev
<= s
->h_end
);
170 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
178 left
= s
->h_end
- s
->h_next
;
179 size
+= sizeof (struct map
);
188 s
->h_next
= s
->h_begin
;
190 if (s
->h_next
+ size
> s
->h_end
)
191 GOMP_PLUGIN_fatal ("unable to push map");
200 offset
= (void *)&m
->mappings
[0] - s
->h
;
202 *d
= (void *)(s
->d
+ offset
);
203 *h
= (void *)(s
->h
+ offset
);
205 s
->h_prev
= s
->h_next
;
211 assert (s
->h_next
>= s
->h_begin
);
212 assert (s
->h_tail
>= s
->h_begin
);
213 assert (s
->h_prev
>= s
->h_begin
);
214 assert (s
->h_next
<= s
->h_end
);
215 assert (s
->h_tail
<= s
->h_end
);
216 assert (s
->h_prev
<= s
->h_end
);
221 /* Target data function launch information. */
223 struct targ_fn_launch
226 unsigned short dim
[GOMP_DIM_MAX
];
229 /* Target PTX object information. */
237 /* Target data image information. */
239 typedef struct nvptx_tdata
241 const struct targ_ptx_obj
*ptx_objs
;
244 const char *const *var_names
;
247 const struct targ_fn_launch
*fn_descs
;
251 /* Descriptor of a loaded function. */
253 struct targ_fn_descriptor
256 const struct targ_fn_launch
*launch
;
259 /* A loaded PTX image. */
260 struct ptx_image_data
262 const void *target_data
;
265 struct targ_fn_descriptor
*fns
; /* Array of functions. */
267 struct ptx_image_data
*next
;
275 struct ptx_stream
*null_stream
;
276 /* All non-null streams associated with this device (actually context),
277 either created implicitly or passed in from the user (via
278 acc_set_cuda_stream). */
279 struct ptx_stream
*active_streams
;
281 struct ptx_stream
**arr
;
284 /* A lock for use when manipulating the above stream list and array. */
285 pthread_mutex_t stream_lock
;
293 struct ptx_image_data
*images
; /* Images loaded on device. */
294 pthread_mutex_t image_lock
; /* Lock for above list. */
296 struct ptx_device
*next
;
304 PTX_EVT_ASYNC_CLEANUP
314 struct ptx_event
*next
;
/* Guards the PTX_EVENTS list below (taken in event_add and event_gc).  */
317 static pthread_mutex_t ptx_event_lock
;
/* Head of the singly-linked list of outstanding asynchronous events,
   shared across all devices.  */
318 static struct ptx_event
*ptx_events
;
/* Table of open devices, indexed by device number.  */
320 static struct ptx_device
**ptx_devices
;
322 static inline struct nvptx_thread
*
325 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
329 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
332 struct ptx_stream
*null_stream
333 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
335 null_stream
->stream
= NULL
;
336 null_stream
->host_thread
= pthread_self ();
337 null_stream
->multithreaded
= true;
338 null_stream
->d
= (CUdeviceptr
) NULL
;
339 null_stream
->h
= NULL
;
340 map_init (null_stream
);
341 ptx_dev
->null_stream
= null_stream
;
343 ptx_dev
->active_streams
= NULL
;
344 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
349 /* This is just a guess -- make space for as many async streams as the
350 current device is capable of concurrently executing. This can grow
351 later as necessary. No streams are created yet. */
352 ptx_dev
->async_streams
.arr
353 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
354 ptx_dev
->async_streams
.size
= concurrency
;
356 for (i
= 0; i
< concurrency
; i
++)
357 ptx_dev
->async_streams
.arr
[i
] = NULL
;
361 fini_streams_for_device (struct ptx_device
*ptx_dev
)
363 free (ptx_dev
->async_streams
.arr
);
365 while (ptx_dev
->active_streams
!= NULL
)
367 struct ptx_stream
*s
= ptx_dev
->active_streams
;
368 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
371 cuStreamDestroy (s
->stream
);
375 map_fini (ptx_dev
->null_stream
);
376 free (ptx_dev
->null_stream
);
379 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
380 thread THREAD (and also current device/context). If CREATE is true, create
381 the stream if it does not exist (or use EXISTING if it is non-NULL), and
382 associate the stream with the same thread argument. Returns stream to use
385 static struct ptx_stream
*
386 select_stream_for_async (int async
, pthread_t thread
, bool create
,
389 struct nvptx_thread
*nvthd
= nvptx_thread ();
390 /* Local copy of TLS variable. */
391 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
392 struct ptx_stream
*stream
= NULL
;
393 int orig_async
= async
;
395 /* The special value acc_async_noval (-1) maps (for now) to an
396 implicitly-created stream, which is then handled the same as any other
397 numbered async stream. Other options are available, e.g. using the null
398 stream for anonymous async operations, or choosing an idle stream from an
399 active set. But, stick with this for now. */
400 if (async
> acc_async_sync
)
404 pthread_mutex_lock (&ptx_dev
->stream_lock
);
406 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
407 null stream, and in fact better performance may be obtainable if it doesn't
408 (because the null stream enforces overly-strict synchronisation with
409 respect to other streams for legacy reasons, and that's probably not
410 needed with OpenACC). Maybe investigate later. */
411 if (async
== acc_async_sync
)
412 stream
= ptx_dev
->null_stream
;
413 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
414 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
415 stream
= ptx_dev
->async_streams
.arr
[async
];
416 else if (async
>= 0 && create
)
418 if (async
>= ptx_dev
->async_streams
.size
)
420 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
422 if (async
>= newsize
)
425 ptx_dev
->async_streams
.arr
426 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
427 newsize
* sizeof (struct ptx_stream
*));
429 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
430 ptx_dev
->async_streams
.arr
[i
] = NULL
;
432 ptx_dev
->async_streams
.size
= newsize
;
435 /* Create a new stream on-demand if there isn't one already, or if we're
436 setting a particular async value to an existing (externally-provided)
438 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
442 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
445 s
->stream
= existing
;
448 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
449 if (r
!= CUDA_SUCCESS
)
450 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r
));
453 /* If CREATE is true, we're going to be queueing some work on this
454 stream. Associate it with the current host thread. */
455 s
->host_thread
= thread
;
456 s
->multithreaded
= false;
458 s
->d
= (CUdeviceptr
) NULL
;
462 s
->next
= ptx_dev
->active_streams
;
463 ptx_dev
->active_streams
= s
;
464 ptx_dev
->async_streams
.arr
[async
] = s
;
467 stream
= ptx_dev
->async_streams
.arr
[async
];
470 GOMP_PLUGIN_fatal ("bad async %d", async
);
474 assert (stream
!= NULL
);
476 /* If we're trying to use the same stream from different threads
477 simultaneously, set stream->multithreaded to true. This affects the
478 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
479 only wait for asynchronous launches from the same host thread they are
480 invoked on. If multiple threads use the same async value, we make note
481 of that here and fall back to testing/waiting for all threads in those
483 if (thread
!= stream
->host_thread
)
484 stream
->multithreaded
= true;
486 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
488 else if (stream
&& !stream
->multithreaded
489 && !pthread_equal (stream
->host_thread
, thread
))
490 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
495 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
496 should be locked on entry and remains locked on exit. */
504 if (instantiated_devices
!= 0)
508 if (r
!= CUDA_SUCCESS
)
509 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r
));
513 pthread_mutex_init (&ptx_event_lock
, NULL
);
515 r
= cuDeviceGetCount (&ndevs
);
516 if (r
!= CUDA_SUCCESS
)
517 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r
));
519 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
525 /* Select the N'th PTX device for the current host thread. The device must
526 have been previously opened before calling this function. */
529 nvptx_attach_host_thread_to_device (int n
)
533 struct ptx_device
*ptx_dev
;
536 r
= cuCtxGetDevice (&dev
);
537 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
538 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r
));
540 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
546 ptx_dev
= ptx_devices
[n
];
549 r
= cuCtxGetCurrent (&thd_ctx
);
550 if (r
!= CUDA_SUCCESS
)
551 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
553 /* We don't necessarily have a current context (e.g. if it has been
554 destroyed. Pop it if we do though. */
557 r
= cuCtxPopCurrent (&old_ctx
);
558 if (r
!= CUDA_SUCCESS
)
559 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r
));
562 r
= cuCtxPushCurrent (ptx_dev
->ctx
);
563 if (r
!= CUDA_SUCCESS
)
564 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r
));
568 static struct ptx_device
*
569 nvptx_open_device (int n
)
571 struct ptx_device
*ptx_dev
;
572 CUdevice dev
, ctx_dev
;
574 int async_engines
, pi
;
576 r
= cuDeviceGet (&dev
, n
);
577 if (r
!= CUDA_SUCCESS
)
578 GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r
));
580 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
584 ptx_dev
->ctx_shared
= false;
586 r
= cuCtxGetDevice (&ctx_dev
);
587 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
588 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r
));
590 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
592 /* The current host thread has an active context for a different device.
596 r
= cuCtxPopCurrent (&old_ctx
);
597 if (r
!= CUDA_SUCCESS
)
598 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r
));
601 r
= cuCtxGetCurrent (&ptx_dev
->ctx
);
602 if (r
!= CUDA_SUCCESS
)
603 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
607 r
= cuCtxCreate (&ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
608 if (r
!= CUDA_SUCCESS
)
609 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r
));
612 ptx_dev
->ctx_shared
= true;
614 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
615 if (r
!= CUDA_SUCCESS
)
616 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
618 ptx_dev
->overlap
= pi
;
620 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
621 if (r
!= CUDA_SUCCESS
)
622 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
626 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
627 if (r
!= CUDA_SUCCESS
)
628 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
630 ptx_dev
->concur
= pi
;
632 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
633 if (r
!= CUDA_SUCCESS
)
634 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
638 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
639 if (r
!= CUDA_SUCCESS
)
640 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
644 r
= cuDeviceGetAttribute (&async_engines
,
645 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
646 if (r
!= CUDA_SUCCESS
)
649 ptx_dev
->images
= NULL
;
650 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
652 init_streams_for_device (ptx_dev
, async_engines
);
658 nvptx_close_device (struct ptx_device
*ptx_dev
)
665 fini_streams_for_device (ptx_dev
);
667 pthread_mutex_destroy (&ptx_dev
->image_lock
);
669 if (!ptx_dev
->ctx_shared
)
671 r
= cuCtxDestroy (ptx_dev
->ctx
);
672 if (r
!= CUDA_SUCCESS
)
673 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r
));
680 nvptx_get_num_devices (void)
685 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
687 if (sizeof (void *) != 8)
690 /* This function will be called before the plugin has been initialized in
691 order to enumerate available devices, but CUDA API routines can't be used
692 until cuInit has been called. Just call it now (but don't yet do any
693 further initialization). */
694 if (instantiated_devices
== 0)
697 /* This is not an error: e.g. we may have CUDA libraries installed but
698 no devices available. */
699 if (r
!= CUDA_SUCCESS
)
703 r
= cuDeviceGetCount (&n
);
704 if (r
!= CUDA_SUCCESS
)
705 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r
));
712 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
715 CUjit_option opts
[7];
721 unsigned long logsize
= LOGSIZE
;
722 CUlinkState linkstate
;
725 size_t linkoutsize
__attribute__ ((unused
));
727 opts
[0] = CU_JIT_WALL_TIME
;
728 optvals
[0] = &elapsed
;
730 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
731 optvals
[1] = &ilog
[0];
733 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
734 optvals
[2] = (void *) logsize
;
736 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
737 optvals
[3] = &elog
[0];
739 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
740 optvals
[4] = (void *) logsize
;
742 opts
[5] = CU_JIT_LOG_VERBOSE
;
743 optvals
[5] = (void *) 1;
745 opts
[6] = CU_JIT_TARGET
;
746 optvals
[6] = (void *) CU_TARGET_COMPUTE_30
;
748 r
= cuLinkCreate (7, opts
, optvals
, &linkstate
);
749 if (r
!= CUDA_SUCCESS
)
750 GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r
));
752 for (; num_objs
--; ptx_objs
++)
754 /* cuLinkAddData's 'data' argument erroneously omits the const
756 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
757 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, (char*)ptx_objs
->code
,
758 ptx_objs
->size
, 0, 0, 0, 0);
759 if (r
!= CUDA_SUCCESS
)
761 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
762 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
767 GOMP_PLUGIN_debug (0, "Linking\n");
768 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
770 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
771 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
773 if (r
!= CUDA_SUCCESS
)
774 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r
));
776 r
= cuModuleLoadData (module
, linkout
);
777 if (r
!= CUDA_SUCCESS
)
778 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r
));
780 r
= cuLinkDestroy (linkstate
);
781 if (r
!= CUDA_SUCCESS
)
782 GOMP_PLUGIN_fatal ("cuLinkDestory error: %s", cuda_error (r
));
786 event_gc (bool memmap_lockable
)
788 struct ptx_event
*ptx_event
= ptx_events
;
789 struct nvptx_thread
*nvthd
= nvptx_thread ();
791 pthread_mutex_lock (&ptx_event_lock
);
793 while (ptx_event
!= NULL
)
796 struct ptx_event
*e
= ptx_event
;
798 ptx_event
= ptx_event
->next
;
800 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
803 r
= cuEventQuery (*e
->evt
);
804 if (r
== CUDA_SUCCESS
)
820 case PTX_EVT_ASYNC_CLEANUP
:
822 /* The function gomp_plugin_async_unmap_vars needs to claim the
823 memory-map splay tree lock for the current device, so we
824 can't call it when one of our callers has already claimed
825 the lock. In that case, just delay the GC for this event
827 if (!memmap_lockable
)
830 GOMP_PLUGIN_async_unmap_vars (e
->addr
);
835 cuEventDestroy (*te
);
839 ptx_events
= ptx_events
->next
;
842 struct ptx_event
*e_
= ptx_events
;
843 while (e_
->next
!= e
)
845 e_
->next
= e_
->next
->next
;
852 pthread_mutex_unlock (&ptx_event_lock
);
856 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
)
858 struct ptx_event
*ptx_event
;
859 struct nvptx_thread
*nvthd
= nvptx_thread ();
861 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
862 || type
== PTX_EVT_ASYNC_CLEANUP
);
864 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
865 ptx_event
->type
= type
;
868 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
870 pthread_mutex_lock (&ptx_event_lock
);
872 ptx_event
->next
= ptx_events
;
873 ptx_events
= ptx_event
;
875 pthread_mutex_unlock (&ptx_event_lock
);
879 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
880 int async
, unsigned *dims
, void *targ_mem_desc
)
882 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
886 struct ptx_stream
*dev_str
;
889 struct nvptx_thread
*nvthd
= nvptx_thread ();
890 const char *maybe_abort_msg
= "(perhaps abort was called)";
892 function
= targ_fn
->fn
;
894 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
895 assert (dev_str
== nvthd
->current_stream
);
897 /* Initialize the launch dimensions. Typically this is constant,
898 provided by the device compiler, but we must permit runtime
900 for (i
= 0; i
!= 3; i
++)
901 if (targ_fn
->launch
->dim
[i
])
902 dims
[i
] = targ_fn
->launch
->dim
[i
];
904 /* This reserves a chunk of a pre-allocated page of memory mapped on both
905 the host and the device. HP is a host pointer to the new chunk, and DP is
906 the corresponding device pointer. */
907 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
909 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
911 /* Copy the array of arguments to the mapped page. */
912 for (i
= 0; i
< mapnum
; i
++)
913 ((void **) hp
)[i
] = devaddrs
[i
];
915 /* Copy the (device) pointers to arguments to the device (dp and hp might in
916 fact have the same value on a unified-memory system). */
917 r
= cuMemcpy ((CUdeviceptr
)dp
, (CUdeviceptr
)hp
, mapnum
* sizeof (void *));
918 if (r
!= CUDA_SUCCESS
)
919 GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r
));
921 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
922 " gangs=%u, workers=%u, vectors=%u\n",
923 __FUNCTION__
, targ_fn
->launch
->fn
,
924 dims
[0], dims
[1], dims
[2]);
928 // num_gangs nctaid.x
929 // num_workers ntid.y
930 // vector length ntid.x
933 r
= cuLaunchKernel (function
,
934 dims
[GOMP_DIM_GANG
], 1, 1,
935 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
936 0, dev_str
->stream
, kargs
, 0);
937 if (r
!= CUDA_SUCCESS
)
938 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
940 #ifndef DISABLE_ASYNC
941 if (async
< acc_async_noval
)
943 r
= cuStreamSynchronize (dev_str
->stream
);
944 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
945 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
947 else if (r
!= CUDA_SUCCESS
)
948 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
954 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
956 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
957 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
958 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
960 else if (r
!= CUDA_SUCCESS
)
961 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
965 r
= cuEventRecord (*e
, dev_str
->stream
);
966 if (r
!= CUDA_SUCCESS
)
967 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
969 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
);
972 r
= cuCtxSynchronize ();
973 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
974 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
976 else if (r
!= CUDA_SUCCESS
)
977 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
980 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
981 targ_fn
->launch
->fn
);
983 #ifndef DISABLE_ASYNC
984 if (async
< acc_async_noval
)
989 void * openacc_get_current_cuda_context (void);
992 nvptx_alloc (size_t s
)
997 r
= cuMemAlloc (&d
, s
);
998 if (r
== CUDA_ERROR_OUT_OF_MEMORY
)
1000 if (r
!= CUDA_SUCCESS
)
1001 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1006 nvptx_free (void *p
)
1012 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)p
);
1013 if (r
!= CUDA_SUCCESS
)
1014 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1016 if ((CUdeviceptr
)p
!= pb
)
1017 GOMP_PLUGIN_fatal ("invalid device address");
1019 r
= cuMemFree ((CUdeviceptr
)p
);
1020 if (r
!= CUDA_SUCCESS
)
1021 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1025 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1030 struct nvptx_thread
*nvthd
= nvptx_thread ();
1036 GOMP_PLUGIN_fatal ("invalid device address");
1038 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1039 if (r
!= CUDA_SUCCESS
)
1040 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1043 GOMP_PLUGIN_fatal ("invalid device address");
1046 GOMP_PLUGIN_fatal ("invalid host address");
1049 GOMP_PLUGIN_fatal ("invalid host or device address");
1051 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1052 GOMP_PLUGIN_fatal ("invalid size");
1054 #ifndef DISABLE_ASYNC
1055 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1059 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1061 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1062 if (r
!= CUDA_SUCCESS
)
1063 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1067 r
= cuMemcpyHtoDAsync ((CUdeviceptr
)d
, h
, s
,
1068 nvthd
->current_stream
->stream
);
1069 if (r
!= CUDA_SUCCESS
)
1070 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r
));
1072 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1073 if (r
!= CUDA_SUCCESS
)
1074 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1076 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1081 r
= cuMemcpyHtoD ((CUdeviceptr
)d
, h
, s
);
1082 if (r
!= CUDA_SUCCESS
)
1083 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1090 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1095 struct nvptx_thread
*nvthd
= nvptx_thread ();
1101 GOMP_PLUGIN_fatal ("invalid device address");
1103 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1104 if (r
!= CUDA_SUCCESS
)
1105 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1108 GOMP_PLUGIN_fatal ("invalid device address");
1111 GOMP_PLUGIN_fatal ("invalid host address");
1114 GOMP_PLUGIN_fatal ("invalid host or device address");
1116 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1117 GOMP_PLUGIN_fatal ("invalid size");
1119 #ifndef DISABLE_ASYNC
1120 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1124 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1126 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1127 if (r
!= CUDA_SUCCESS
)
1128 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r
));
1132 r
= cuMemcpyDtoHAsync (h
, (CUdeviceptr
)d
, s
,
1133 nvthd
->current_stream
->stream
);
1134 if (r
!= CUDA_SUCCESS
)
1135 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r
));
1137 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1138 if (r
!= CUDA_SUCCESS
)
1139 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1141 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1146 r
= cuMemcpyDtoH (h
, (CUdeviceptr
)d
, s
);
1147 if (r
!= CUDA_SUCCESS
)
1148 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1155 nvptx_set_async (int async
)
1157 struct nvptx_thread
*nvthd
= nvptx_thread ();
1158 nvthd
->current_stream
1159 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1163 nvptx_async_test (int async
)
1166 struct ptx_stream
*s
;
1168 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1171 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1173 r
= cuStreamQuery (s
->stream
);
1174 if (r
== CUDA_SUCCESS
)
1176 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1177 whether all work has completed on this stream, and if so omits the call
1178 to the wait hook. If that happens, event_gc might not get called
1179 (which prevents variables from getting unmapped and their associated
1180 device storage freed), so call it here. */
1184 else if (r
== CUDA_ERROR_NOT_READY
)
1187 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1193 nvptx_async_test_all (void)
1195 struct ptx_stream
*s
;
1196 pthread_t self
= pthread_self ();
1197 struct nvptx_thread
*nvthd
= nvptx_thread ();
1199 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1201 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1203 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1204 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1206 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1211 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1219 nvptx_wait (int async
)
1222 struct ptx_stream
*s
;
1224 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1227 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1229 r
= cuStreamSynchronize (s
->stream
);
1230 if (r
!= CUDA_SUCCESS
)
1231 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1237 nvptx_wait_async (int async1
, int async2
)
1241 struct ptx_stream
*s1
, *s2
;
1242 pthread_t self
= pthread_self ();
1244 /* The stream that is waiting (rather than being waited for) doesn't
1245 necessarily have to exist already. */
1246 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1248 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1250 GOMP_PLUGIN_fatal ("invalid async 1\n");
1253 GOMP_PLUGIN_fatal ("identical parameters");
1255 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1257 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1258 if (r
!= CUDA_SUCCESS
)
1259 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1263 r
= cuEventRecord (*e
, s1
->stream
);
1264 if (r
!= CUDA_SUCCESS
)
1265 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1267 event_add (PTX_EVT_SYNC
, e
, NULL
);
1269 r
= cuStreamWaitEvent (s2
->stream
, *e
, 0);
1270 if (r
!= CUDA_SUCCESS
)
1271 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r
));
1275 nvptx_wait_all (void)
1278 struct ptx_stream
*s
;
1279 pthread_t self
= pthread_self ();
1280 struct nvptx_thread
*nvthd
= nvptx_thread ();
1282 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1284 /* Wait for active streams initiated by this thread (or by multiple threads)
1286 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1288 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1290 r
= cuStreamQuery (s
->stream
);
1291 if (r
== CUDA_SUCCESS
)
1293 else if (r
!= CUDA_ERROR_NOT_READY
)
1294 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1296 r
= cuStreamSynchronize (s
->stream
);
1297 if (r
!= CUDA_SUCCESS
)
1298 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1302 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1308 nvptx_wait_all_async (int async
)
1311 struct ptx_stream
*waiting_stream
, *other_stream
;
1313 struct nvptx_thread
*nvthd
= nvptx_thread ();
1314 pthread_t self
= pthread_self ();
1316 /* The stream doing the waiting. This could be the first mention of the
1317 stream, so create it if necessary. */
1319 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1321 /* Launches on the null stream already block on other streams in the
1323 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1328 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1330 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1331 other_stream
!= NULL
;
1332 other_stream
= other_stream
->next
)
1334 if (!other_stream
->multithreaded
1335 && !pthread_equal (other_stream
->host_thread
, self
))
1338 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1340 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1341 if (r
!= CUDA_SUCCESS
)
1342 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1344 /* Record an event on the waited-for stream. */
1345 r
= cuEventRecord (*e
, other_stream
->stream
);
1346 if (r
!= CUDA_SUCCESS
)
1347 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1349 event_add (PTX_EVT_SYNC
, e
, NULL
);
1351 r
= cuStreamWaitEvent (waiting_stream
->stream
, *e
, 0);
1352 if (r
!= CUDA_SUCCESS
)
1353 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r
));
1356 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1360 nvptx_get_current_cuda_device (void)
1362 struct nvptx_thread
*nvthd
= nvptx_thread ();
1364 if (!nvthd
|| !nvthd
->ptx_dev
)
1367 return &nvthd
->ptx_dev
->dev
;
1371 nvptx_get_current_cuda_context (void)
1373 struct nvptx_thread
*nvthd
= nvptx_thread ();
1375 if (!nvthd
|| !nvthd
->ptx_dev
)
1378 return nvthd
->ptx_dev
->ctx
;
1382 nvptx_get_cuda_stream (int async
)
1384 struct ptx_stream
*s
;
1385 struct nvptx_thread
*nvthd
= nvptx_thread ();
1387 if (!nvthd
|| !nvthd
->ptx_dev
)
1390 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1392 return s
? s
->stream
: NULL
;
1396 nvptx_set_cuda_stream (int async
, void *stream
)
1398 struct ptx_stream
*oldstream
;
1399 pthread_t self
= pthread_self ();
1400 struct nvptx_thread
*nvthd
= nvptx_thread ();
1402 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1405 GOMP_PLUGIN_fatal ("bad async %d", async
);
1407 /* We have a list of active streams and an array mapping async values to
1408 entries of that list. We need to take "ownership" of the passed-in stream,
1409 and add it to our list, removing the previous entry also (if there was one)
1410 in order to prevent resource leaks. Note the potential for surprise
1411 here: maybe we should keep track of passed-in streams and leave it up to
1412 the user to tidy those up, but that doesn't work for stream handles
1413 returned from acc_get_cuda_stream above... */
1415 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1419 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1420 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1423 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1424 while (s
->next
!= oldstream
)
1426 s
->next
= s
->next
->next
;
1429 cuStreamDestroy (oldstream
->stream
);
1430 map_fini (oldstream
);
1434 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1436 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1441 /* Plugin entry points. */
/* Return the canonical name of this offload target.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1450 GOMP_OFFLOAD_get_caps (void)
1452 return GOMP_OFFLOAD_CAP_OPENACC_200
;
1456 GOMP_OFFLOAD_get_type (void)
1458 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Return the number of usable PTX devices present on this system.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1468 GOMP_OFFLOAD_init_device (int n
)
1470 pthread_mutex_lock (&ptx_dev_lock
);
1472 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1474 pthread_mutex_unlock (&ptx_dev_lock
);
1478 ptx_devices
[n
] = nvptx_open_device (n
);
1479 instantiated_devices
++;
1481 pthread_mutex_unlock (&ptx_dev_lock
);
1485 GOMP_OFFLOAD_fini_device (int n
)
1487 pthread_mutex_lock (&ptx_dev_lock
);
1489 if (ptx_devices
[n
] != NULL
)
1491 nvptx_attach_host_thread_to_device (n
);
1492 nvptx_close_device (ptx_devices
[n
]);
1493 ptx_devices
[n
] = NULL
;
1494 instantiated_devices
--;
1497 pthread_mutex_unlock (&ptx_dev_lock
);
1500 /* Return the libgomp version number we're compatible with. There is
1501 no requirement for cross-version compatibility. */
1504 GOMP_OFFLOAD_version (void)
1506 return GOMP_VERSION
;
1509 /* Load the (partial) program described by TARGET_DATA to device
1510 number ORD. Allocate and return TARGET_TABLE. */
1513 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1514 struct addr_pair
**target_table
)
1517 const char *const *var_names
;
1518 const struct targ_fn_launch
*fn_descs
;
1519 unsigned int fn_entries
, var_entries
, i
, j
;
1521 struct targ_fn_descriptor
*targ_fns
;
1522 struct addr_pair
*targ_tbl
;
1523 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1524 struct ptx_image_data
*new_image
;
1525 struct ptx_device
*dev
;
1527 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1528 GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
1529 " (expected %u, received %u)",
1530 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1532 GOMP_OFFLOAD_init_device (ord
);
1534 dev
= ptx_devices
[ord
];
1536 nvptx_attach_host_thread_to_device (ord
);
1538 link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
);
1540 /* The mkoffload utility emits a struct of pointers/integers at the
1541 start of each offload image. The array of kernel names and the
1542 functions addresses form a one-to-one correspondence. */
1544 var_entries
= img_header
->var_num
;
1545 var_names
= img_header
->var_names
;
1546 fn_entries
= img_header
->fn_num
;
1547 fn_descs
= img_header
->fn_descs
;
1549 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1550 * (fn_entries
+ var_entries
));
1551 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1554 *target_table
= targ_tbl
;
1556 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1557 new_image
->target_data
= target_data
;
1558 new_image
->module
= module
;
1559 new_image
->fns
= targ_fns
;
1561 pthread_mutex_lock (&dev
->image_lock
);
1562 new_image
->next
= dev
->images
;
1563 dev
->images
= new_image
;
1564 pthread_mutex_unlock (&dev
->image_lock
);
1566 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1568 CUfunction function
;
1570 r
= cuModuleGetFunction (&function
, module
, fn_descs
[i
].fn
);
1571 if (r
!= CUDA_SUCCESS
)
1572 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r
));
1574 targ_fns
->fn
= function
;
1575 targ_fns
->launch
= &fn_descs
[i
];
1577 targ_tbl
->start
= (uintptr_t) targ_fns
;
1578 targ_tbl
->end
= targ_tbl
->start
+ 1;
1581 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1586 r
= cuModuleGetGlobal (&var
, &bytes
, module
, var_names
[j
]);
1587 if (r
!= CUDA_SUCCESS
)
1588 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1590 targ_tbl
->start
= (uintptr_t) var
;
1591 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1594 return fn_entries
+ var_entries
;
1597 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1598 function descriptors allocated by G_O_load_image. */
1601 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1603 struct ptx_image_data
*image
, **prev_p
;
1604 struct ptx_device
*dev
= ptx_devices
[ord
];
1606 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1609 pthread_mutex_lock (&dev
->image_lock
);
1610 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1611 if (image
->target_data
== target_data
)
1613 *prev_p
= image
->next
;
1614 cuModuleUnload (image
->module
);
1619 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes of device memory on device ORD.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}
/* Release device memory PTR previously obtained from GOMP_OFFLOAD_alloc
   on device ORD.  */

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}
/* Copy N bytes from device address SRC on device ORD to host address DST.  */

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}
/* Copy N bytes from host address SRC to device address DST on device ORD.  */

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
/* Hook for running a function on the device.  NULL by default and never
   assigned in this file.  NOTE(review): presumably set from elsewhere
   (e.g. another translation unit or a test harness) -- confirm before
   relying on it.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* Launch kernel FN with MAPNUM host/device address pairs, async queue
   ASYNC, and launch geometry DIMS; forwards directly to nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
1661 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
)
1665 struct nvptx_thread
*nvthd
= nvptx_thread ();
1667 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1669 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1670 if (r
!= CUDA_SUCCESS
)
1671 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1673 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1674 if (r
!= CUDA_SUCCESS
)
1675 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1677 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
);
/* Return nonzero if all work on async queue ASYNC has completed.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* Return nonzero if all outstanding asynchronous work has completed.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* Block until all work on async queue ASYNC has completed.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* Make async queue ASYNC2 wait for all work queued on ASYNC1.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* Block until every outstanding asynchronous operation has completed.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* Make async queue ASYNC wait for all work on every other queue.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* Set the calling thread's current async queue to ASYNC.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
1723 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1725 struct ptx_device
*ptx_dev
;
1726 struct nvptx_thread
*nvthd
1727 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1731 ptx_dev
= ptx_devices
[ord
];
1735 r
= cuCtxGetCurrent (&thd_ctx
);
1736 if (r
!= CUDA_SUCCESS
)
1737 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
1739 assert (ptx_dev
->ctx
);
1743 r
= cuCtxPushCurrent (ptx_dev
->ctx
);
1744 if (r
!= CUDA_SUCCESS
)
1745 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r
));
1748 nvthd
->current_stream
= ptx_dev
->null_stream
;
1749 nvthd
->ptx_dev
= ptx_dev
;
1751 return (void *) nvthd
;
/* Release per-thread state DATA allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* OpenACC interop: return the calling thread's CUDA device handle.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* OpenACC interop: return the calling thread's CUDA context.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}