1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2015 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
36 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
51 cuda_error (CUresult r
)
53 #if CUDA_VERSION < 7000
54 /* Specified in documentation and present in library from at least
55 5.5. Not declared in header file prior to 7.0. */
56 extern CUresult
cuGetErrorString (CUresult
, const char **);
60 r
= cuGetErrorString (r
, &desc
);
61 if (r
!= CUDA_SUCCESS
)
62 desc
= "unknown cuda error";
67 static unsigned int instantiated_devices
= 0;
68 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
73 pthread_t host_thread
;
84 struct ptx_stream
*next
;
87 /* Thread-specific data for PTX. */
91 struct ptx_stream
*current_stream
;
92 struct ptx_device
*ptx_dev
;
103 map_init (struct ptx_stream
*s
)
107 int size
= getpagesize ();
113 r
= cuMemAllocHost (&s
->h
, size
);
114 if (r
!= CUDA_SUCCESS
)
115 GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r
));
117 r
= cuMemHostGetDevicePointer (&s
->d
, s
->h
, 0);
118 if (r
!= CUDA_SUCCESS
)
119 GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r
));
124 s
->h_end
= s
->h_begin
+ size
;
125 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
132 map_fini (struct ptx_stream
*s
)
136 r
= cuMemFreeHost (s
->h
);
137 if (r
!= CUDA_SUCCESS
)
138 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r
));
142 map_pop (struct ptx_stream
*s
)
153 s
->h_tail
+= m
->size
;
155 if (s
->h_tail
>= s
->h_end
)
156 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
158 if (s
->h_next
== s
->h_tail
)
159 s
->h_prev
= s
->h_next
;
161 assert (s
->h_next
>= s
->h_begin
);
162 assert (s
->h_tail
>= s
->h_begin
);
163 assert (s
->h_prev
>= s
->h_begin
);
165 assert (s
->h_next
<= s
->h_end
);
166 assert (s
->h_tail
<= s
->h_end
);
167 assert (s
->h_prev
<= s
->h_end
);
171 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
179 left
= s
->h_end
- s
->h_next
;
180 size
+= sizeof (struct map
);
189 s
->h_next
= s
->h_begin
;
191 if (s
->h_next
+ size
> s
->h_end
)
192 GOMP_PLUGIN_fatal ("unable to push map");
201 offset
= (void *)&m
->mappings
[0] - s
->h
;
203 *d
= (void *)(s
->d
+ offset
);
204 *h
= (void *)(s
->h
+ offset
);
206 s
->h_prev
= s
->h_next
;
212 assert (s
->h_next
>= s
->h_begin
);
213 assert (s
->h_tail
>= s
->h_begin
);
214 assert (s
->h_prev
>= s
->h_begin
);
215 assert (s
->h_next
<= s
->h_end
);
216 assert (s
->h_tail
<= s
->h_end
);
217 assert (s
->h_prev
<= s
->h_end
);
222 /* Target data function launch information. */
224 struct targ_fn_launch
227 unsigned short dim
[3];
230 /* Descriptor of a loaded function. */
232 struct targ_fn_descriptor
235 const struct targ_fn_launch
*launch
;
238 /* A loaded PTX image. */
239 struct ptx_image_data
241 const void *target_data
;
244 struct targ_fn_descriptor
*fns
; /* Array of functions. */
246 struct ptx_image_data
*next
;
254 struct ptx_stream
*null_stream
;
255 /* All non-null streams associated with this device (actually context),
256 either created implicitly or passed in from the user (via
257 acc_set_cuda_stream). */
258 struct ptx_stream
*active_streams
;
260 struct ptx_stream
**arr
;
263 /* A lock for use when manipulating the above stream list and array. */
264 pthread_mutex_t stream_lock
;
272 struct ptx_image_data
*images
; /* Images loaded on device. */
273 pthread_mutex_t image_lock
; /* Lock for above list. */
275 struct ptx_device
*next
;
283 PTX_EVT_ASYNC_CLEANUP
293 struct ptx_event
*next
;
296 static pthread_mutex_t ptx_event_lock
;
297 static struct ptx_event
*ptx_events
;
299 static struct ptx_device
**ptx_devices
;
301 static inline struct nvptx_thread
*
304 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
308 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
311 struct ptx_stream
*null_stream
312 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
314 null_stream
->stream
= NULL
;
315 null_stream
->host_thread
= pthread_self ();
316 null_stream
->multithreaded
= true;
317 null_stream
->d
= (CUdeviceptr
) NULL
;
318 null_stream
->h
= NULL
;
319 map_init (null_stream
);
320 ptx_dev
->null_stream
= null_stream
;
322 ptx_dev
->active_streams
= NULL
;
323 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
328 /* This is just a guess -- make space for as many async streams as the
329 current device is capable of concurrently executing. This can grow
330 later as necessary. No streams are created yet. */
331 ptx_dev
->async_streams
.arr
332 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
333 ptx_dev
->async_streams
.size
= concurrency
;
335 for (i
= 0; i
< concurrency
; i
++)
336 ptx_dev
->async_streams
.arr
[i
] = NULL
;
340 fini_streams_for_device (struct ptx_device
*ptx_dev
)
342 free (ptx_dev
->async_streams
.arr
);
344 while (ptx_dev
->active_streams
!= NULL
)
346 struct ptx_stream
*s
= ptx_dev
->active_streams
;
347 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
350 cuStreamDestroy (s
->stream
);
354 map_fini (ptx_dev
->null_stream
);
355 free (ptx_dev
->null_stream
);
358 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
359 thread THREAD (and also current device/context). If CREATE is true, create
360 the stream if it does not exist (or use EXISTING if it is non-NULL), and
361 associate the stream with the same thread argument. Returns stream to use
364 static struct ptx_stream
*
365 select_stream_for_async (int async
, pthread_t thread
, bool create
,
368 struct nvptx_thread
*nvthd
= nvptx_thread ();
369 /* Local copy of TLS variable. */
370 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
371 struct ptx_stream
*stream
= NULL
;
372 int orig_async
= async
;
374 /* The special value acc_async_noval (-1) maps (for now) to an
375 implicitly-created stream, which is then handled the same as any other
376 numbered async stream. Other options are available, e.g. using the null
377 stream for anonymous async operations, or choosing an idle stream from an
378 active set. But, stick with this for now. */
379 if (async
> acc_async_sync
)
383 pthread_mutex_lock (&ptx_dev
->stream_lock
);
385 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
386 null stream, and in fact better performance may be obtainable if it doesn't
387 (because the null stream enforces overly-strict synchronisation with
388 respect to other streams for legacy reasons, and that's probably not
389 needed with OpenACC). Maybe investigate later. */
390 if (async
== acc_async_sync
)
391 stream
= ptx_dev
->null_stream
;
392 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
393 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
394 stream
= ptx_dev
->async_streams
.arr
[async
];
395 else if (async
>= 0 && create
)
397 if (async
>= ptx_dev
->async_streams
.size
)
399 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
401 if (async
>= newsize
)
404 ptx_dev
->async_streams
.arr
405 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
406 newsize
* sizeof (struct ptx_stream
*));
408 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
409 ptx_dev
->async_streams
.arr
[i
] = NULL
;
411 ptx_dev
->async_streams
.size
= newsize
;
414 /* Create a new stream on-demand if there isn't one already, or if we're
415 setting a particular async value to an existing (externally-provided)
417 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
421 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
424 s
->stream
= existing
;
427 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
428 if (r
!= CUDA_SUCCESS
)
429 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r
));
432 /* If CREATE is true, we're going to be queueing some work on this
433 stream. Associate it with the current host thread. */
434 s
->host_thread
= thread
;
435 s
->multithreaded
= false;
437 s
->d
= (CUdeviceptr
) NULL
;
441 s
->next
= ptx_dev
->active_streams
;
442 ptx_dev
->active_streams
= s
;
443 ptx_dev
->async_streams
.arr
[async
] = s
;
446 stream
= ptx_dev
->async_streams
.arr
[async
];
449 GOMP_PLUGIN_fatal ("bad async %d", async
);
453 assert (stream
!= NULL
);
455 /* If we're trying to use the same stream from different threads
456 simultaneously, set stream->multithreaded to true. This affects the
457 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
458 only wait for asynchronous launches from the same host thread they are
459 invoked on. If multiple threads use the same async value, we make note
460 of that here and fall back to testing/waiting for all threads in those
462 if (thread
!= stream
->host_thread
)
463 stream
->multithreaded
= true;
465 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
467 else if (stream
&& !stream
->multithreaded
468 && !pthread_equal (stream
->host_thread
, thread
))
469 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
474 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
475 should be locked on entry and remains locked on exit. */
483 if (instantiated_devices
!= 0)
487 if (r
!= CUDA_SUCCESS
)
488 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r
));
492 pthread_mutex_init (&ptx_event_lock
, NULL
);
494 r
= cuDeviceGetCount (&ndevs
);
495 if (r
!= CUDA_SUCCESS
)
496 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r
));
498 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
504 /* Select the N'th PTX device for the current host thread. The device must
505 have been previously opened before calling this function. */
508 nvptx_attach_host_thread_to_device (int n
)
512 struct ptx_device
*ptx_dev
;
515 r
= cuCtxGetDevice (&dev
);
516 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
517 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r
));
519 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
525 ptx_dev
= ptx_devices
[n
];
528 r
= cuCtxGetCurrent (&thd_ctx
);
529 if (r
!= CUDA_SUCCESS
)
530 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
532 /* We don't necessarily have a current context (e.g. if it has been
533 destroyed. Pop it if we do though. */
536 r
= cuCtxPopCurrent (&old_ctx
);
537 if (r
!= CUDA_SUCCESS
)
538 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r
));
541 r
= cuCtxPushCurrent (ptx_dev
->ctx
);
542 if (r
!= CUDA_SUCCESS
)
543 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r
));
547 static struct ptx_device
*
548 nvptx_open_device (int n
)
550 struct ptx_device
*ptx_dev
;
551 CUdevice dev
, ctx_dev
;
553 int async_engines
, pi
;
555 r
= cuDeviceGet (&dev
, n
);
556 if (r
!= CUDA_SUCCESS
)
557 GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r
));
559 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
563 ptx_dev
->ctx_shared
= false;
565 r
= cuCtxGetDevice (&ctx_dev
);
566 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
567 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r
));
569 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
571 /* The current host thread has an active context for a different device.
575 r
= cuCtxPopCurrent (&old_ctx
);
576 if (r
!= CUDA_SUCCESS
)
577 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r
));
580 r
= cuCtxGetCurrent (&ptx_dev
->ctx
);
581 if (r
!= CUDA_SUCCESS
)
582 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
586 r
= cuCtxCreate (&ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
587 if (r
!= CUDA_SUCCESS
)
588 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r
));
591 ptx_dev
->ctx_shared
= true;
593 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
594 if (r
!= CUDA_SUCCESS
)
595 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
597 ptx_dev
->overlap
= pi
;
599 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
600 if (r
!= CUDA_SUCCESS
)
601 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
605 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
606 if (r
!= CUDA_SUCCESS
)
607 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
609 ptx_dev
->concur
= pi
;
611 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
612 if (r
!= CUDA_SUCCESS
)
613 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
617 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
618 if (r
!= CUDA_SUCCESS
)
619 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r
));
623 r
= cuDeviceGetAttribute (&async_engines
,
624 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
625 if (r
!= CUDA_SUCCESS
)
628 ptx_dev
->images
= NULL
;
629 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
631 init_streams_for_device (ptx_dev
, async_engines
);
637 nvptx_close_device (struct ptx_device
*ptx_dev
)
644 fini_streams_for_device (ptx_dev
);
646 pthread_mutex_destroy (&ptx_dev
->image_lock
);
648 if (!ptx_dev
->ctx_shared
)
650 r
= cuCtxDestroy (ptx_dev
->ctx
);
651 if (r
!= CUDA_SUCCESS
)
652 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r
));
659 nvptx_get_num_devices (void)
664 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
666 if (sizeof (void *) != 8)
669 /* This function will be called before the plugin has been initialized in
670 order to enumerate available devices, but CUDA API routines can't be used
671 until cuInit has been called. Just call it now (but don't yet do any
672 further initialization). */
673 if (instantiated_devices
== 0)
676 /* This is not an error: e.g. we may have CUDA libraries installed but
677 no devices available. */
678 if (r
!= CUDA_SUCCESS
)
682 r
= cuDeviceGetCount (&n
);
683 if (r
!= CUDA_SUCCESS
)
684 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r
));
691 link_ptx (CUmodule
*module
, const char *ptx_code
)
693 CUjit_option opts
[7];
699 unsigned long logsize
= LOGSIZE
;
700 CUlinkState linkstate
;
703 size_t linkoutsize
__attribute__ ((unused
));
705 GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code
);
707 opts
[0] = CU_JIT_WALL_TIME
;
708 optvals
[0] = &elapsed
;
710 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
711 optvals
[1] = &ilog
[0];
713 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
714 optvals
[2] = (void *) logsize
;
716 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
717 optvals
[3] = &elog
[0];
719 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
720 optvals
[4] = (void *) logsize
;
722 opts
[5] = CU_JIT_LOG_VERBOSE
;
723 optvals
[5] = (void *) 1;
725 opts
[6] = CU_JIT_TARGET
;
726 optvals
[6] = (void *) CU_TARGET_COMPUTE_30
;
728 r
= cuLinkCreate (7, opts
, optvals
, &linkstate
);
729 if (r
!= CUDA_SUCCESS
)
730 GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r
));
732 char *abort_ptx
= ABORT_PTX
;
733 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, abort_ptx
,
734 strlen (abort_ptx
) + 1, 0, 0, 0, 0);
735 if (r
!= CUDA_SUCCESS
)
737 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
738 GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r
));
741 char *acc_on_device_ptx
= ACC_ON_DEVICE_PTX
;
742 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, acc_on_device_ptx
,
743 strlen (acc_on_device_ptx
) + 1, 0, 0, 0, 0);
744 if (r
!= CUDA_SUCCESS
)
746 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
747 GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
751 char *goacc_internal_ptx
= GOACC_INTERNAL_PTX
;
752 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, goacc_internal_ptx
,
753 strlen (goacc_internal_ptx
) + 1, 0, 0, 0, 0);
754 if (r
!= CUDA_SUCCESS
)
756 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
757 GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
761 /* cuLinkAddData's 'data' argument erroneously omits the const qualifier. */
762 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, (char *)ptx_code
,
763 strlen (ptx_code
) + 1, 0, 0, 0, 0);
764 if (r
!= CUDA_SUCCESS
)
766 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
767 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r
));
770 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
771 if (r
!= CUDA_SUCCESS
)
772 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r
));
774 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
775 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
777 r
= cuModuleLoadData (module
, linkout
);
778 if (r
!= CUDA_SUCCESS
)
779 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r
));
783 event_gc (bool memmap_lockable
)
785 struct ptx_event
*ptx_event
= ptx_events
;
786 struct nvptx_thread
*nvthd
= nvptx_thread ();
788 pthread_mutex_lock (&ptx_event_lock
);
790 while (ptx_event
!= NULL
)
793 struct ptx_event
*e
= ptx_event
;
795 ptx_event
= ptx_event
->next
;
797 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
800 r
= cuEventQuery (*e
->evt
);
801 if (r
== CUDA_SUCCESS
)
817 case PTX_EVT_ASYNC_CLEANUP
:
819 /* The function gomp_plugin_async_unmap_vars needs to claim the
820 memory-map splay tree lock for the current device, so we
821 can't call it when one of our callers has already claimed
822 the lock. In that case, just delay the GC for this event
824 if (!memmap_lockable
)
827 GOMP_PLUGIN_async_unmap_vars (e
->addr
);
832 cuEventDestroy (*te
);
836 ptx_events
= ptx_events
->next
;
839 struct ptx_event
*e_
= ptx_events
;
840 while (e_
->next
!= e
)
842 e_
->next
= e_
->next
->next
;
849 pthread_mutex_unlock (&ptx_event_lock
);
853 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
)
855 struct ptx_event
*ptx_event
;
856 struct nvptx_thread
*nvthd
= nvptx_thread ();
858 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
859 || type
== PTX_EVT_ASYNC_CLEANUP
);
861 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
862 ptx_event
->type
= type
;
865 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
867 pthread_mutex_lock (&ptx_event_lock
);
869 ptx_event
->next
= ptx_events
;
870 ptx_events
= ptx_event
;
872 pthread_mutex_unlock (&ptx_event_lock
);
876 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
877 size_t *sizes
, unsigned short *kinds
, int async
, unsigned *dims
,
880 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
884 struct ptx_stream
*dev_str
;
887 struct nvptx_thread
*nvthd
= nvptx_thread ();
888 const char *maybe_abort_msg
= "(perhaps abort was called)";
890 function
= targ_fn
->fn
;
892 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
893 assert (dev_str
== nvthd
->current_stream
);
895 /* Initialize the launch dimensions. Typically this is constant,
896 provided by the device compiler, but we must permit runtime
898 for (i
= 0; i
!= 3; i
++)
899 if (targ_fn
->launch
->dim
[i
])
900 dims
[i
] = targ_fn
->launch
->dim
[i
];
902 if (dims
[GOMP_DIM_GANG
] != 1)
903 GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported",
904 dims
[GOMP_DIM_GANG
]);
905 if (dims
[GOMP_DIM_WORKER
] != 1)
906 GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported",
907 dims
[GOMP_DIM_WORKER
]);
909 /* This reserves a chunk of a pre-allocated page of memory mapped on both
910 the host and the device. HP is a host pointer to the new chunk, and DP is
911 the corresponding device pointer. */
912 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
914 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
916 /* Copy the array of arguments to the mapped page. */
917 for (i
= 0; i
< mapnum
; i
++)
918 ((void **) hp
)[i
] = devaddrs
[i
];
920 /* Copy the (device) pointers to arguments to the device (dp and hp might in
921 fact have the same value on a unified-memory system). */
922 r
= cuMemcpy ((CUdeviceptr
)dp
, (CUdeviceptr
)hp
, mapnum
* sizeof (void *));
923 if (r
!= CUDA_SUCCESS
)
924 GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r
));
926 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
927 " gangs=%u, workers=%u, vectors=%u\n",
928 __FUNCTION__
, targ_fn
->launch
->fn
,
929 dims
[0], dims
[1], dims
[2]);
933 // num_gangs nctaid.x
934 // num_workers ntid.y
935 // vector length ntid.x
938 r
= cuLaunchKernel (function
,
939 dims
[GOMP_DIM_GANG
], 1, 1,
940 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
941 0, dev_str
->stream
, kargs
, 0);
942 if (r
!= CUDA_SUCCESS
)
943 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
945 #ifndef DISABLE_ASYNC
946 if (async
< acc_async_noval
)
948 r
= cuStreamSynchronize (dev_str
->stream
);
949 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
950 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
952 else if (r
!= CUDA_SUCCESS
)
953 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
959 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
961 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
962 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
963 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
965 else if (r
!= CUDA_SUCCESS
)
966 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
970 r
= cuEventRecord (*e
, dev_str
->stream
);
971 if (r
!= CUDA_SUCCESS
)
972 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
974 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
);
977 r
= cuCtxSynchronize ();
978 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
979 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
981 else if (r
!= CUDA_SUCCESS
)
982 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
985 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
986 targ_fn
->launch
->fn
);
988 #ifndef DISABLE_ASYNC
989 if (async
< acc_async_noval
)
994 void * openacc_get_current_cuda_context (void);
997 nvptx_alloc (size_t s
)
1002 r
= cuMemAlloc (&d
, s
);
1003 if (r
== CUDA_ERROR_OUT_OF_MEMORY
)
1005 if (r
!= CUDA_SUCCESS
)
1006 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1011 nvptx_free (void *p
)
1017 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)p
);
1018 if (r
!= CUDA_SUCCESS
)
1019 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1021 if ((CUdeviceptr
)p
!= pb
)
1022 GOMP_PLUGIN_fatal ("invalid device address");
1024 r
= cuMemFree ((CUdeviceptr
)p
);
1025 if (r
!= CUDA_SUCCESS
)
1026 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1030 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1035 struct nvptx_thread
*nvthd
= nvptx_thread ();
1041 GOMP_PLUGIN_fatal ("invalid device address");
1043 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1044 if (r
!= CUDA_SUCCESS
)
1045 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1048 GOMP_PLUGIN_fatal ("invalid device address");
1051 GOMP_PLUGIN_fatal ("invalid host address");
1054 GOMP_PLUGIN_fatal ("invalid host or device address");
1056 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1057 GOMP_PLUGIN_fatal ("invalid size");
1059 #ifndef DISABLE_ASYNC
1060 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1064 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1066 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1067 if (r
!= CUDA_SUCCESS
)
1068 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1072 r
= cuMemcpyHtoDAsync ((CUdeviceptr
)d
, h
, s
,
1073 nvthd
->current_stream
->stream
);
1074 if (r
!= CUDA_SUCCESS
)
1075 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r
));
1077 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1078 if (r
!= CUDA_SUCCESS
)
1079 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1081 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1086 r
= cuMemcpyHtoD ((CUdeviceptr
)d
, h
, s
);
1087 if (r
!= CUDA_SUCCESS
)
1088 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1095 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1100 struct nvptx_thread
*nvthd
= nvptx_thread ();
1106 GOMP_PLUGIN_fatal ("invalid device address");
1108 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1109 if (r
!= CUDA_SUCCESS
)
1110 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r
));
1113 GOMP_PLUGIN_fatal ("invalid device address");
1116 GOMP_PLUGIN_fatal ("invalid host address");
1119 GOMP_PLUGIN_fatal ("invalid host or device address");
1121 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1122 GOMP_PLUGIN_fatal ("invalid size");
1124 #ifndef DISABLE_ASYNC
1125 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1129 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1131 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1132 if (r
!= CUDA_SUCCESS
)
1133 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r
));
1137 r
= cuMemcpyDtoHAsync (h
, (CUdeviceptr
)d
, s
,
1138 nvthd
->current_stream
->stream
);
1139 if (r
!= CUDA_SUCCESS
)
1140 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r
));
1142 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1143 if (r
!= CUDA_SUCCESS
)
1144 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1146 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1151 r
= cuMemcpyDtoH (h
, (CUdeviceptr
)d
, s
);
1152 if (r
!= CUDA_SUCCESS
)
1153 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1160 nvptx_set_async (int async
)
1162 struct nvptx_thread
*nvthd
= nvptx_thread ();
1163 nvthd
->current_stream
1164 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1168 nvptx_async_test (int async
)
1171 struct ptx_stream
*s
;
1173 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1176 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1178 r
= cuStreamQuery (s
->stream
);
1179 if (r
== CUDA_SUCCESS
)
1181 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1182 whether all work has completed on this stream, and if so omits the call
1183 to the wait hook. If that happens, event_gc might not get called
1184 (which prevents variables from getting unmapped and their associated
1185 device storage freed), so call it here. */
1189 else if (r
== CUDA_ERROR_NOT_READY
)
1192 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1198 nvptx_async_test_all (void)
1200 struct ptx_stream
*s
;
1201 pthread_t self
= pthread_self ();
1202 struct nvptx_thread
*nvthd
= nvptx_thread ();
1204 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1206 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1208 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1209 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1211 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1216 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1224 nvptx_wait (int async
)
1227 struct ptx_stream
*s
;
1229 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1232 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1234 r
= cuStreamSynchronize (s
->stream
);
1235 if (r
!= CUDA_SUCCESS
)
1236 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1242 nvptx_wait_async (int async1
, int async2
)
1246 struct ptx_stream
*s1
, *s2
;
1247 pthread_t self
= pthread_self ();
1249 /* The stream that is waiting (rather than being waited for) doesn't
1250 necessarily have to exist already. */
1251 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1253 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1255 GOMP_PLUGIN_fatal ("invalid async 1\n");
1258 GOMP_PLUGIN_fatal ("identical parameters");
1260 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1262 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1263 if (r
!= CUDA_SUCCESS
)
1264 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1268 r
= cuEventRecord (*e
, s1
->stream
);
1269 if (r
!= CUDA_SUCCESS
)
1270 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1272 event_add (PTX_EVT_SYNC
, e
, NULL
);
1274 r
= cuStreamWaitEvent (s2
->stream
, *e
, 0);
1275 if (r
!= CUDA_SUCCESS
)
1276 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r
));
1280 nvptx_wait_all (void)
1283 struct ptx_stream
*s
;
1284 pthread_t self
= pthread_self ();
1285 struct nvptx_thread
*nvthd
= nvptx_thread ();
1287 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1289 /* Wait for active streams initiated by this thread (or by multiple threads)
1291 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1293 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1295 r
= cuStreamQuery (s
->stream
);
1296 if (r
== CUDA_SUCCESS
)
1298 else if (r
!= CUDA_ERROR_NOT_READY
)
1299 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1301 r
= cuStreamSynchronize (s
->stream
);
1302 if (r
!= CUDA_SUCCESS
)
1303 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1307 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1313 nvptx_wait_all_async (int async
)
1316 struct ptx_stream
*waiting_stream
, *other_stream
;
1318 struct nvptx_thread
*nvthd
= nvptx_thread ();
1319 pthread_t self
= pthread_self ();
1321 /* The stream doing the waiting. This could be the first mention of the
1322 stream, so create it if necessary. */
1324 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1326 /* Launches on the null stream already block on other streams in the
1328 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1333 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1335 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1336 other_stream
!= NULL
;
1337 other_stream
= other_stream
->next
)
1339 if (!other_stream
->multithreaded
1340 && !pthread_equal (other_stream
->host_thread
, self
))
1343 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1345 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1346 if (r
!= CUDA_SUCCESS
)
1347 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1349 /* Record an event on the waited-for stream. */
1350 r
= cuEventRecord (*e
, other_stream
->stream
);
1351 if (r
!= CUDA_SUCCESS
)
1352 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1354 event_add (PTX_EVT_SYNC
, e
, NULL
);
1356 r
= cuStreamWaitEvent (waiting_stream
->stream
, *e
, 0);
1357 if (r
!= CUDA_SUCCESS
)
1358 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r
));
1361 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1365 nvptx_get_current_cuda_device (void)
1367 struct nvptx_thread
*nvthd
= nvptx_thread ();
1369 if (!nvthd
|| !nvthd
->ptx_dev
)
1372 return &nvthd
->ptx_dev
->dev
;
1376 nvptx_get_current_cuda_context (void)
1378 struct nvptx_thread
*nvthd
= nvptx_thread ();
1380 if (!nvthd
|| !nvthd
->ptx_dev
)
1383 return nvthd
->ptx_dev
->ctx
;
1387 nvptx_get_cuda_stream (int async
)
1389 struct ptx_stream
*s
;
1390 struct nvptx_thread
*nvthd
= nvptx_thread ();
1392 if (!nvthd
|| !nvthd
->ptx_dev
)
1395 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1397 return s
? s
->stream
: NULL
;
1401 nvptx_set_cuda_stream (int async
, void *stream
)
1403 struct ptx_stream
*oldstream
;
1404 pthread_t self
= pthread_self ();
1405 struct nvptx_thread
*nvthd
= nvptx_thread ();
1407 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1410 GOMP_PLUGIN_fatal ("bad async %d", async
);
1412 /* We have a list of active streams and an array mapping async values to
1413 entries of that list. We need to take "ownership" of the passed-in stream,
1414 and add it to our list, removing the previous entry also (if there was one)
1415 in order to prevent resource leaks. Note the potential for surprise
1416 here: maybe we should keep track of passed-in streams and leave it up to
1417 the user to tidy those up, but that doesn't work for stream handles
1418 returned from acc_get_cuda_stream above... */
1420 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1424 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1425 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1428 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1429 while (s
->next
!= oldstream
)
1431 s
->next
= s
->next
->next
;
1434 cuStreamDestroy (oldstream
->stream
);
1435 map_fini (oldstream
);
1439 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1441 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
/* Plugin entry points.  */
/* Return the constant name of this offload target.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1455 GOMP_OFFLOAD_get_caps (void)
1457 return GOMP_OFFLOAD_CAP_OPENACC_200
;
1461 GOMP_OFFLOAD_get_type (void)
1463 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Return the number of PTX devices available on this system.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1473 GOMP_OFFLOAD_init_device (int n
)
1475 pthread_mutex_lock (&ptx_dev_lock
);
1477 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1479 pthread_mutex_unlock (&ptx_dev_lock
);
1483 ptx_devices
[n
] = nvptx_open_device (n
);
1484 instantiated_devices
++;
1486 pthread_mutex_unlock (&ptx_dev_lock
);
1490 GOMP_OFFLOAD_fini_device (int n
)
1492 pthread_mutex_lock (&ptx_dev_lock
);
1494 if (ptx_devices
[n
] != NULL
)
1496 nvptx_attach_host_thread_to_device (n
);
1497 nvptx_close_device (ptx_devices
[n
]);
1498 ptx_devices
[n
] = NULL
;
1499 instantiated_devices
--;
1502 pthread_mutex_unlock (&ptx_dev_lock
);
/* Data emitted by mkoffload.  */
1507 typedef struct nvptx_tdata
1509 const char *ptx_src
;
1511 const char *const *var_names
;
1514 const struct targ_fn_launch
*fn_descs
;
1518 /* Return the libgomp version number we're compatible with. There is
1519 no requirement for cross-version compatibility. */
1522 GOMP_OFFLOAD_version (void)
1524 return GOMP_VERSION
;
1527 /* Load the (partial) program described by TARGET_DATA to device
1528 number ORD. Allocate and return TARGET_TABLE. */
1531 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1532 struct addr_pair
**target_table
)
1535 const char *const *var_names
;
1536 const struct targ_fn_launch
*fn_descs
;
1537 unsigned int fn_entries
, var_entries
, i
, j
;
1539 struct targ_fn_descriptor
*targ_fns
;
1540 struct addr_pair
*targ_tbl
;
1541 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1542 struct ptx_image_data
*new_image
;
1543 struct ptx_device
*dev
;
1545 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1546 GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
1547 " (expected %u, received %u)",
1548 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1550 GOMP_OFFLOAD_init_device (ord
);
1552 dev
= ptx_devices
[ord
];
1554 nvptx_attach_host_thread_to_device (ord
);
1556 link_ptx (&module
, img_header
->ptx_src
);
1558 /* The mkoffload utility emits a struct of pointers/integers at the
1559 start of each offload image. The array of kernel names and the
1560 functions addresses form a one-to-one correspondence. */
1562 var_entries
= img_header
->var_num
;
1563 var_names
= img_header
->var_names
;
1564 fn_entries
= img_header
->fn_num
;
1565 fn_descs
= img_header
->fn_descs
;
1567 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1568 * (fn_entries
+ var_entries
));
1569 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1572 *target_table
= targ_tbl
;
1574 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1575 new_image
->target_data
= target_data
;
1576 new_image
->module
= module
;
1577 new_image
->fns
= targ_fns
;
1579 pthread_mutex_lock (&dev
->image_lock
);
1580 new_image
->next
= dev
->images
;
1581 dev
->images
= new_image
;
1582 pthread_mutex_unlock (&dev
->image_lock
);
1584 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1586 CUfunction function
;
1588 r
= cuModuleGetFunction (&function
, module
, fn_descs
[i
].fn
);
1589 if (r
!= CUDA_SUCCESS
)
1590 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r
));
1592 targ_fns
->fn
= function
;
1593 targ_fns
->launch
= &fn_descs
[i
];
1595 targ_tbl
->start
= (uintptr_t) targ_fns
;
1596 targ_tbl
->end
= targ_tbl
->start
+ 1;
1599 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1604 r
= cuModuleGetGlobal (&var
, &bytes
, module
, var_names
[j
]);
1605 if (r
!= CUDA_SUCCESS
)
1606 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1608 targ_tbl
->start
= (uintptr_t) var
;
1609 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1612 return fn_entries
+ var_entries
;
1615 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1616 function descriptors allocated by G_O_load_image. */
1619 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1621 struct ptx_image_data
*image
, **prev_p
;
1622 struct ptx_device
*dev
= ptx_devices
[ord
];
1624 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1627 pthread_mutex_lock (&dev
->image_lock
);
1628 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1629 if (image
->target_data
== target_data
)
1631 *prev_p
= image
->next
;
1632 cuModuleUnload (image
->module
);
1637 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes on device ORD and return the device pointer.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}
/* Release the device memory at PTR on device ORD.  */

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}
/* Copy N bytes from device address SRC on device ORD to host address
   DST.  */

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}
/* Copy N bytes from host address SRC to device address DST on device
   ORD.  */

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
/* Hook for running a target function; initialized to NULL here.
   NOTE(review): no assignment or call is visible in this file —
   presumably set by other machinery; confirm before removing.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* Launch an OpenACC parallel region by delegating to nvptx_exec, which
   performs the actual kernel launch.  */

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               size_t *sizes, unsigned short *kinds,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
              async, dims, targ_mem_desc);
}
1681 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
)
1685 struct nvptx_thread
*nvthd
= nvptx_thread ();
1687 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1689 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1690 if (r
!= CUDA_SUCCESS
)
1691 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1693 r
= cuEventRecord (*e
, nvthd
->current_stream
->stream
);
1694 if (r
!= CUDA_SUCCESS
)
1695 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r
));
1697 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
);
/* Test whether the work queued under ASYNC has completed.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* Test whether all outstanding asynchronous work has completed.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* Block the host until the work queued under ASYNC has completed.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* Make the ASYNC2 queue wait for the work queued under ASYNC1, without
   blocking the host.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* Block the host until all outstanding asynchronous work completes.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* Make the ASYNC queue wait on all other queues, without blocking the
   host.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* Set the default async queue for subsequent operations on this
   thread.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
1743 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1745 struct ptx_device
*ptx_dev
;
1746 struct nvptx_thread
*nvthd
1747 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1751 ptx_dev
= ptx_devices
[ord
];
1755 r
= cuCtxGetCurrent (&thd_ctx
);
1756 if (r
!= CUDA_SUCCESS
)
1757 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r
));
1759 assert (ptx_dev
->ctx
);
1763 r
= cuCtxPushCurrent (ptx_dev
->ctx
);
1764 if (r
!= CUDA_SUCCESS
)
1765 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r
));
1768 nvthd
->current_stream
= ptx_dev
->null_stream
;
1769 nvthd
->ptx_dev
= ptx_dev
;
1771 return (void *) nvthd
;
/* Release the per-thread state allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Expose the current thread's CUDA device handle to the OpenACC
   runtime.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Expose the current thread's CUDA context to the OpenACC runtime.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
/* Return the CUDA stream associated with ASYNC for this thread.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
/* Associate the given CUDA stream with ASYNC for this thread.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}