/* Plugin for NVPTX execution.

   Copyright (C) 2013-2014 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
28 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
29 library appears to hold some implicit state, but the documentation
30 is not clear as to what that state might be. Or how one might
31 propagate it from one thread to another. */
34 //#define DISABLE_ASYNC
40 #include "libgomp-plugin.h"
50 #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
52 static struct _errlist
57 { CUDA_ERROR_INVALID_VALUE
, "invalid value" },
58 { CUDA_ERROR_OUT_OF_MEMORY
, "out of memory" },
59 { CUDA_ERROR_NOT_INITIALIZED
, "not initialized" },
60 { CUDA_ERROR_DEINITIALIZED
, "deinitialized" },
61 { CUDA_ERROR_PROFILER_DISABLED
, "profiler disabled" },
62 { CUDA_ERROR_PROFILER_NOT_INITIALIZED
, "profiler not initialized" },
63 { CUDA_ERROR_PROFILER_ALREADY_STARTED
, "already started" },
64 { CUDA_ERROR_PROFILER_ALREADY_STOPPED
, "already stopped" },
65 { CUDA_ERROR_NO_DEVICE
, "no device" },
66 { CUDA_ERROR_INVALID_DEVICE
, "invalid device" },
67 { CUDA_ERROR_INVALID_IMAGE
, "invalid image" },
68 { CUDA_ERROR_INVALID_CONTEXT
, "invalid context" },
69 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT
, "context already current" },
70 { CUDA_ERROR_MAP_FAILED
, "map error" },
71 { CUDA_ERROR_UNMAP_FAILED
, "unmap error" },
72 { CUDA_ERROR_ARRAY_IS_MAPPED
, "array is mapped" },
73 { CUDA_ERROR_ALREADY_MAPPED
, "already mapped" },
74 { CUDA_ERROR_NO_BINARY_FOR_GPU
, "no binary for gpu" },
75 { CUDA_ERROR_ALREADY_ACQUIRED
, "already acquired" },
76 { CUDA_ERROR_NOT_MAPPED
, "not mapped" },
77 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY
, "not mapped as array" },
78 { CUDA_ERROR_NOT_MAPPED_AS_POINTER
, "not mapped as pointer" },
79 { CUDA_ERROR_ECC_UNCORRECTABLE
, "ecc uncorrectable" },
80 { CUDA_ERROR_UNSUPPORTED_LIMIT
, "unsupported limit" },
81 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE
, "context already in use" },
82 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED
, "peer access unsupported" },
83 { CUDA_ERROR_INVALID_SOURCE
, "invalid source" },
84 { CUDA_ERROR_FILE_NOT_FOUND
, "file not found" },
85 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
,
86 "shared object symbol not found" },
87 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
, "shared object init error" },
88 { CUDA_ERROR_OPERATING_SYSTEM
, "operating system" },
89 { CUDA_ERROR_INVALID_HANDLE
, "invalid handle" },
90 { CUDA_ERROR_NOT_FOUND
, "not found" },
91 { CUDA_ERROR_NOT_READY
, "not ready" },
92 { CUDA_ERROR_LAUNCH_FAILED
, "launch error" },
93 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
, "launch out of resources" },
94 { CUDA_ERROR_LAUNCH_TIMEOUT
, "launch timeout" },
95 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
,
96 "launch incompatibe texturing" },
97 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
, "peer access already enabled" },
98 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
, "peer access not enabled " },
99 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
, "primary cotext active" },
100 { CUDA_ERROR_CONTEXT_IS_DESTROYED
, "context is destroyed" },
101 { CUDA_ERROR_ASSERT
, "assert" },
102 { CUDA_ERROR_TOO_MANY_PEERS
, "too many peers" },
103 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
,
104 "host memory already registered" },
105 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
, "host memory not registered" },
106 { CUDA_ERROR_NOT_PERMITTED
, "no permitted" },
107 { CUDA_ERROR_NOT_SUPPORTED
, "not supported" },
108 { CUDA_ERROR_UNKNOWN
, "unknown" }
111 static char errmsg
[128];
114 cuErrorMsg (CUresult r
)
118 for (i
= 0; i
< ARRAYSIZE (cuErrorList
); i
++)
120 if (cuErrorList
[i
].r
== r
)
121 return &cuErrorList
[i
].m
[0];
124 sprintf (&errmsg
[0], "unknown result code: %5d", r
);
129 struct targ_fn_descriptor
135 static bool PTX_inited
= false;
140 pthread_t host_thread
;
151 struct PTX_stream
*next
;
154 /* Each thread may select a stream (also specific to a device/context). */
155 static __thread
struct PTX_stream
*current_stream
;
165 map_init (struct PTX_stream
*s
)
169 int size
= getpagesize ();
175 r
= cuMemAllocHost (&s
->h
, size
);
176 if (r
!= CUDA_SUCCESS
)
177 GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r
));
179 r
= cuMemHostGetDevicePointer (&s
->d
, s
->h
, 0);
180 if (r
!= CUDA_SUCCESS
)
181 GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r
));
186 s
->h_end
= s
->h_begin
+ size
;
187 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
194 map_fini (struct PTX_stream
*s
)
198 r
= cuMemFreeHost (s
->h
);
199 if (r
!= CUDA_SUCCESS
)
200 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r
));
204 map_pop (struct PTX_stream
*s
)
215 s
->h_tail
+= m
->size
;
217 if (s
->h_tail
>= s
->h_end
)
218 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
220 if (s
->h_next
== s
->h_tail
)
221 s
->h_prev
= s
->h_next
;
223 assert (s
->h_next
>= s
->h_begin
);
224 assert (s
->h_tail
>= s
->h_begin
);
225 assert (s
->h_prev
>= s
->h_begin
);
227 assert (s
->h_next
<= s
->h_end
);
228 assert (s
->h_tail
<= s
->h_end
);
229 assert (s
->h_prev
<= s
->h_end
);
233 map_push (struct PTX_stream
*s
, int async
, size_t size
, void **h
, void **d
)
241 left
= s
->h_end
- s
->h_next
;
242 size
+= sizeof (struct map
);
251 s
->h_next
= s
->h_begin
;
253 if (s
->h_next
+ size
> s
->h_end
)
254 GOMP_PLUGIN_fatal ("unable to push map");
263 offset
= (void *)&m
->mappings
[0] - s
->h
;
265 *d
= (void *)(s
->d
+ offset
);
266 *h
= (void *)(s
->h
+ offset
);
268 s
->h_prev
= s
->h_next
;
274 assert (s
->h_next
>= s
->h_begin
);
275 assert (s
->h_tail
>= s
->h_begin
);
276 assert (s
->h_prev
>= s
->h_begin
);
277 assert (s
->h_next
<= s
->h_end
);
278 assert (s
->h_tail
<= s
->h_end
);
279 assert (s
->h_prev
<= s
->h_end
);
289 struct PTX_stream
*null_stream
;
290 /* All non-null streams associated with this device (actually context),
291 either created implicitly or passed in from the user (via
292 acc_set_cuda_stream). */
293 struct PTX_stream
*active_streams
;
295 struct PTX_stream
**arr
;
298 /* A lock for use when manipulating the above stream list and array. */
299 gomp_mutex_t stream_lock
;
307 struct PTX_device
*next
;
310 static __thread
struct PTX_device
*PTX_dev
;
311 static struct PTX_device
*PTX_devices
;
318 PTX_EVT_ASYNC_CLEANUP
328 struct PTX_event
*next
;
331 static gomp_mutex_t PTX_event_lock
;
332 static struct PTX_event
*PTX_events
;
334 #define _XSTR(s) _STR(s)
337 static struct _synames
342 { _XSTR(cuCtxCreate
) },
343 { _XSTR(cuCtxDestroy
) },
344 { _XSTR(cuCtxGetCurrent
) },
345 { _XSTR(cuCtxPushCurrent
) },
346 { _XSTR(cuCtxSynchronize
) },
347 { _XSTR(cuDeviceGet
) },
348 { _XSTR(cuDeviceGetAttribute
) },
349 { _XSTR(cuDeviceGetCount
) },
350 { _XSTR(cuEventCreate
) },
351 { _XSTR(cuEventDestroy
) },
352 { _XSTR(cuEventQuery
) },
353 { _XSTR(cuEventRecord
) },
355 { _XSTR(cuLaunchKernel
) },
356 { _XSTR(cuLinkAddData
) },
357 { _XSTR(cuLinkComplete
) },
358 { _XSTR(cuLinkCreate
) },
359 { _XSTR(cuMemAlloc
) },
360 { _XSTR(cuMemAllocHost
) },
362 { _XSTR(cuMemcpyDtoH
) },
363 { _XSTR(cuMemcpyDtoHAsync
) },
364 { _XSTR(cuMemcpyHtoD
) },
365 { _XSTR(cuMemcpyHtoDAsync
) },
366 { _XSTR(cuMemFree
) },
367 { _XSTR(cuMemFreeHost
) },
368 { _XSTR(cuMemGetAddressRange
) },
369 { _XSTR(cuMemHostGetDevicePointer
) },
370 { _XSTR(cuMemHostRegister
) },
371 { _XSTR(cuMemHostUnregister
) },
372 { _XSTR(cuModuleGetFunction
) },
373 { _XSTR(cuModuleLoadData
) },
374 { _XSTR(cuStreamDestroy
) },
375 { _XSTR(cuStreamQuery
) },
376 { _XSTR(cuStreamSynchronize
) },
377 { _XSTR(cuStreamWaitEvent
) }
381 verify_device_library (void)
386 dh
= dlopen ("libcuda.so", RTLD_LAZY
);
390 for (i
= 0; i
< ARRAYSIZE (cuSymNames
); i
++)
392 ds
= dlsym (dh
, cuSymNames
[i
].n
);
403 init_streams_for_device (struct PTX_device
*ptx_dev
, int concurrency
)
406 struct PTX_stream
*null_stream
407 = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream
));
409 null_stream
->stream
= NULL
;
410 null_stream
->host_thread
= pthread_self ();
411 null_stream
->multithreaded
= true;
412 null_stream
->d
= (CUdeviceptr
) NULL
;
413 null_stream
->h
= NULL
;
414 map_init (null_stream
);
415 ptx_dev
->null_stream
= null_stream
;
417 ptx_dev
->active_streams
= NULL
;
418 GOMP_PLUGIN_mutex_init (&ptx_dev
->stream_lock
);
423 /* This is just a guess -- make space for as many async streams as the
424 current device is capable of concurrently executing. This can grow
425 later as necessary. No streams are created yet. */
426 ptx_dev
->async_streams
.arr
427 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct PTX_stream
*));
428 ptx_dev
->async_streams
.size
= concurrency
;
430 for (i
= 0; i
< concurrency
; i
++)
431 ptx_dev
->async_streams
.arr
[i
] = NULL
;
435 fini_streams_for_device (struct PTX_device
*ptx_dev
)
437 free (ptx_dev
->async_streams
.arr
);
439 while (ptx_dev
->active_streams
!= NULL
)
441 struct PTX_stream
*s
= ptx_dev
->active_streams
;
442 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
444 cuStreamDestroy (s
->stream
);
449 map_fini (ptx_dev
->null_stream
);
450 free (ptx_dev
->null_stream
);
453 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
454 thread THREAD (and also current device/context). If CREATE is true, create
455 the stream if it does not exist (or use EXISTING if it is non-NULL), and
456 associate the stream with the same thread argument. Returns stream to use
459 static struct PTX_stream
*
460 select_stream_for_async (int async
, pthread_t thread
, bool create
,
463 /* Local copy of TLS variable. */
464 struct PTX_device
*ptx_dev
= PTX_dev
;
465 struct PTX_stream
*stream
= NULL
;
466 int orig_async
= async
;
468 /* The special value acc_async_noval (-1) maps (for now) to an
469 implicitly-created stream, which is then handled the same as any other
470 numbered async stream. Other options are available, e.g. using the null
471 stream for anonymous async operations, or choosing an idle stream from an
472 active set. But, stick with this for now. */
473 if (async
> acc_async_sync
)
477 GOMP_PLUGIN_mutex_lock (&ptx_dev
->stream_lock
);
479 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
480 null stream, and in fact better performance may be obtainable if it doesn't
481 (because the null stream enforces overly-strict synchronisation with
482 respect to other streams for legacy reasons, and that's probably not
483 needed with OpenACC). Maybe investigate later. */
484 if (async
== acc_async_sync
)
485 stream
= ptx_dev
->null_stream
;
486 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
487 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
488 stream
= ptx_dev
->async_streams
.arr
[async
];
489 else if (async
>= 0 && create
)
491 if (async
>= ptx_dev
->async_streams
.size
)
493 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
495 if (async
>= newsize
)
498 ptx_dev
->async_streams
.arr
499 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
500 newsize
* sizeof (struct PTX_stream
*));
502 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
503 ptx_dev
->async_streams
.arr
[i
] = NULL
;
505 ptx_dev
->async_streams
.size
= newsize
;
508 /* Create a new stream on-demand if there isn't one already, or if we're
509 setting a particular async value to an existing (externally-provided)
511 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
515 = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream
));
518 s
->stream
= existing
;
521 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
522 if (r
!= CUDA_SUCCESS
)
523 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r
));
526 /* If CREATE is true, we're going to be queueing some work on this
527 stream. Associate it with the current host thread. */
528 s
->host_thread
= thread
;
529 s
->multithreaded
= false;
531 s
->d
= (CUdeviceptr
) NULL
;
535 s
->next
= ptx_dev
->active_streams
;
536 ptx_dev
->active_streams
= s
;
537 ptx_dev
->async_streams
.arr
[async
] = s
;
540 stream
= ptx_dev
->async_streams
.arr
[async
];
543 GOMP_PLUGIN_fatal ("bad async %d", async
);
547 assert (stream
!= NULL
);
549 /* If we're trying to use the same stream from different threads
550 simultaneously, set stream->multithreaded to true. This affects the
551 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
552 only wait for asynchronous launches from the same host thread they are
553 invoked on. If multiple threads use the same async value, we make note
554 of that here and fall back to testing/waiting for all threads in those
556 if (thread
!= stream
->host_thread
)
557 stream
->multithreaded
= true;
559 GOMP_PLUGIN_mutex_unlock (&ptx_dev
->stream_lock
);
561 else if (stream
&& !stream
->multithreaded
562 && !pthread_equal (stream
->host_thread
, thread
))
563 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
566 fprintf (stderr
, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
567 "for async %d\n", __FILE__
, __FUNCTION__
, stream
,
568 stream
? stream
->stream
: NULL
, orig_async
);
574 static int PTX_get_num_devices (void);
576 /* Initialize the device. */
584 return PTX_get_num_devices ();
586 rc
= verify_device_library ();
591 if (r
!= CUDA_SUCCESS
)
592 GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r
));
597 GOMP_PLUGIN_mutex_init (&PTX_event_lock
);
601 return PTX_get_num_devices ();
613 PTX_open_device (int n
)
617 int async_engines
, pi
;
621 struct PTX_device
*ptx_device
;
623 for (ptx_device
= PTX_devices
;
625 ptx_device
= ptx_device
->next
)
627 if (ptx_device
->ord
== n
)
629 PTX_dev
= ptx_device
;
633 r
= cuCtxPushCurrent (PTX_dev
->ctx
);
634 if (r
!= CUDA_SUCCESS
)
635 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
639 return (void *)PTX_dev
;
644 r
= cuDeviceGet (&dev
, n
);
645 if (r
!= CUDA_SUCCESS
)
646 GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r
));
648 PTX_dev
= GOMP_PLUGIN_malloc (sizeof (struct PTX_device
));
651 PTX_dev
->ctx_shared
= false;
653 PTX_dev
->next
= PTX_devices
;
654 PTX_devices
= PTX_dev
;
656 r
= cuCtxGetCurrent (&PTX_dev
->ctx
);
657 if (r
!= CUDA_SUCCESS
)
658 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r
));
662 r
= cuCtxCreate (&PTX_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
663 if (r
!= CUDA_SUCCESS
)
664 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r
));
668 PTX_dev
->ctx_shared
= true;
671 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
672 if (r
!= CUDA_SUCCESS
)
673 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r
));
675 PTX_dev
->overlap
= pi
;
677 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
678 if (r
!= CUDA_SUCCESS
)
679 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r
));
683 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
684 if (r
!= CUDA_SUCCESS
)
685 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r
));
687 PTX_dev
->concur
= pi
;
689 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
690 if (r
!= CUDA_SUCCESS
)
691 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r
));
695 r
= cuDeviceGetAttribute (&pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
696 if (r
!= CUDA_SUCCESS
)
697 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r
));
701 r
= cuDeviceGetAttribute (&async_engines
,
702 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
703 if (r
!= CUDA_SUCCESS
)
706 init_streams_for_device (PTX_dev
, async_engines
);
708 current_stream
= PTX_dev
->null_stream
;
710 return (void *)PTX_dev
;
714 PTX_close_device (void *h
__attribute__((unused
)))
721 fini_streams_for_device (PTX_dev
);
723 if (!PTX_dev
->ctx_shared
)
725 r
= cuCtxDestroy (PTX_dev
->ctx
);
726 if (r
!= CUDA_SUCCESS
)
727 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r
));
730 if (PTX_devices
== PTX_dev
)
731 PTX_devices
= PTX_devices
->next
;
734 struct PTX_device
* d
= PTX_devices
;
735 while (d
->next
!= PTX_dev
)
737 d
->next
= d
->next
->next
;
747 PTX_get_num_devices (void)
754 r
= cuDeviceGetCount (&n
);
755 if (r
!= CUDA_SUCCESS
)
756 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r
));
775 ".address_size 64\n" \
776 ".visible .func abort;\n" \
777 ".visible .func abort\n" \
782 ".visible .func _gfortran_abort;\n" \
783 ".visible .func _gfortran_abort\n" \
791 $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
793 #define ACC_ON_DEVICE_PTX \
796 " .address_size 64\n" \
797 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
798 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
800 " .reg.u32 %ar1;\n" \
801 ".reg.u32 %retval;\n" \
802 " .reg.u64 %hr10;\n" \
803 " .reg.u32 %r24;\n" \
804 " .reg.u32 %r25;\n" \
805 " .reg.pred %r27;\n" \
806 " .reg.u32 %r30;\n" \
807 " ld.param.u32 %ar1, [%in_ar1];\n" \
808 " mov.u32 %r24, %ar1;\n" \
809 " setp.ne.u32 %r27,%r24,4;\n" \
810 " set.u32.eq.u32 %r30,%r24,5;\n" \
811 " neg.s32 %r25, %r30;\n" \
812 " @%r27 bra $L3;\n" \
813 " mov.u32 %r25, 1;\n" \
815 " mov.u32 %retval, %r25;\n" \
816 " st.param.u32 [%out_retval], %retval;\n" \
819 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
820 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
822 " .reg.u64 %ar1;\n" \
823 ".reg.u32 %retval;\n" \
824 " .reg.u64 %hr10;\n" \
825 " .reg.u64 %r25;\n" \
826 " .reg.u32 %r26;\n" \
827 " .reg.u32 %r27;\n" \
828 " ld.param.u64 %ar1, [%in_ar1];\n" \
829 " mov.u64 %r25, %ar1;\n" \
830 " ld.u32 %r26, [%r25];\n" \
832 " .param.u32 %retval_in;\n" \
834 " .param.u32 %out_arg0;\n" \
835 " st.param.u32 [%out_arg0], %r26;\n" \
836 " call (%retval_in), acc_on_device, (%out_arg0);\n" \
838 " ld.param.u32 %r27, [%retval_in];\n" \
840 " mov.u32 %retval, %r27;\n" \
841 " st.param.u32 [%out_retval], %retval;\n" \
846 link_ptx (CUmodule
*module
, char *ptx_code
)
848 CUjit_option opts
[7];
854 unsigned long logsize
= LOGSIZE
;
855 CUlinkState linkstate
;
858 size_t linkoutsize
__attribute__((unused
));
860 GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code
);
862 opts
[0] = CU_JIT_WALL_TIME
;
863 optvals
[0] = &elapsed
;
865 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
866 optvals
[1] = &ilog
[0];
868 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
869 optvals
[2] = (void *) logsize
;
871 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
872 optvals
[3] = &elog
[0];
874 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
875 optvals
[4] = (void *) logsize
;
877 opts
[5] = CU_JIT_LOG_VERBOSE
;
878 optvals
[5] = (void *) 1;
880 opts
[6] = CU_JIT_TARGET
;
881 optvals
[6] = (void *) CU_TARGET_COMPUTE_30
;
883 r
= cuLinkCreate (7, opts
, optvals
, &linkstate
);
884 if (r
!= CUDA_SUCCESS
)
885 GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r
));
887 char *abort_ptx
= ABORT_PTX
;
888 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, abort_ptx
,
889 strlen (abort_ptx
) + 1, 0, 0, 0, 0);
890 if (r
!= CUDA_SUCCESS
)
892 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
893 GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r
));
896 char *acc_on_device_ptx
= ACC_ON_DEVICE_PTX
;
897 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, acc_on_device_ptx
,
898 strlen (acc_on_device_ptx
) + 1, 0, 0, 0, 0);
899 if (r
!= CUDA_SUCCESS
)
901 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
902 GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
906 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, ptx_code
,
907 strlen (ptx_code
) + 1, 0, 0, 0, 0);
908 if (r
!= CUDA_SUCCESS
)
910 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
911 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r
));
914 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
915 if (r
!= CUDA_SUCCESS
)
916 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r
));
918 GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed
);
919 GOMP_PLUGIN_notify ("Link log %s\n", &ilog
[0]);
921 r
= cuModuleLoadData (module
, linkout
);
922 if (r
!= CUDA_SUCCESS
)
923 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r
));
927 event_gc (bool memmap_lockable
)
929 struct PTX_event
*ptx_event
= PTX_events
;
931 GOMP_PLUGIN_mutex_lock (&PTX_event_lock
);
933 while (ptx_event
!= NULL
)
936 struct PTX_event
*e
= ptx_event
;
938 ptx_event
= ptx_event
->next
;
940 if (e
->ord
!= PTX_dev
->ord
)
943 r
= cuEventQuery (*e
->evt
);
944 if (r
== CUDA_SUCCESS
)
960 case PTX_EVT_ASYNC_CLEANUP
:
962 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
963 memory-map splay tree lock for the current device, so we
964 can't call it when one of our callers has already claimed
965 the lock. In that case, just delay the GC for this event
967 if (!memmap_lockable
)
970 GOMP_PLUGIN_async_unmap_vars (e
->addr
);
975 cuEventDestroy (*te
);
979 PTX_events
= PTX_events
->next
;
982 struct PTX_event
*e_
= PTX_events
;
983 while (e_
->next
!= e
)
985 e_
->next
= e_
->next
->next
;
992 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock
);
996 event_add (enum PTX_event_type type
, CUevent
*e
, void *h
)
998 struct PTX_event
*ptx_event
;
1000 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1001 || type
== PTX_EVT_ASYNC_CLEANUP
);
1003 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct PTX_event
));
1004 ptx_event
->type
= type
;
1006 ptx_event
->addr
= h
;
1007 ptx_event
->ord
= PTX_dev
->ord
;
1009 GOMP_PLUGIN_mutex_lock (&PTX_event_lock
);
1011 ptx_event
->next
= PTX_events
;
1012 PTX_events
= ptx_event
;
1014 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock
);
1018 PTX_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1019 size_t *sizes
, unsigned short *kinds
, int num_gangs
, int num_workers
,
1020 int vector_length
, int async
, void *targ_mem_desc
)
1022 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1023 CUfunction function
;
1026 struct PTX_stream
*dev_str
;
1029 unsigned int nthreads_in_block
;
1031 function
= targ_fn
->fn
;
1033 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1034 assert (dev_str
== current_stream
);
1036 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1037 the host and the device. HP is a host pointer to the new chunk, and DP is
1038 the corresponding device pointer. */
1039 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
1041 GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__
);
1043 /* Copy the array of arguments to the mapped page. */
1044 for (i
= 0; i
< mapnum
; i
++)
1045 ((void **) hp
)[i
] = devaddrs
[i
];
1047 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1048 fact have the same value on a unified-memory system). */
1049 r
= cuMemcpy ((CUdeviceptr
)dp
, (CUdeviceptr
)hp
, mapnum
* sizeof (void *));
1050 if (r
!= CUDA_SUCCESS
)
1051 GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r
));
1053 GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__
, targ_fn
->name
);
1055 // XXX: possible geometry mappings??
1060 // num_workers warps (where a warp is equivalent to 32 threads)
1061 // vector length threads
1064 /* The openacc vector_length clause 'determines the vector length to use for
1065 vector or SIMD operations'. The question is how to map this to CUDA.
1067 In CUDA, the warp size is the vector length of a CUDA device. However, the
1068 CUDA interface abstracts away from that, and only shows us warp size
1069 indirectly in maximum number of threads per block, which is a product of
1070 warp size and the number of hyperthreads of a multiprocessor.
1072 We choose to map openacc vector_length directly onto the number of threads
1073 in a block, in the x dimension. This is reflected in gcc code generation
1074 that uses ThreadIdx.x to access vector elements.
1076 Attempting to use an openacc vector_length of more than the maximum number
1077 of threads per block will result in a cuda error. */
1078 nthreads_in_block
= vector_length
;
1081 r
= cuLaunchKernel (function
,
1083 nthreads_in_block
, 1, 1,
1084 0, dev_str
->stream
, kargs
, 0);
1085 if (r
!= CUDA_SUCCESS
)
1086 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r
));
1088 #ifndef DISABLE_ASYNC
1089 if (async
< acc_async_noval
)
1091 r
= cuStreamSynchronize (dev_str
->stream
);
1092 if (r
!= CUDA_SUCCESS
)
1093 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r
));
1099 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1101 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1102 if (r
!= CUDA_SUCCESS
)
1103 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r
));
1107 r
= cuEventRecord (*e
, dev_str
->stream
);
1108 if (r
!= CUDA_SUCCESS
)
1109 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1111 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
);
1114 r
= cuCtxSynchronize ();
1115 if (r
!= CUDA_SUCCESS
)
1116 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r
));
1119 GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__
,
1122 #ifndef DISABLE_ASYNC
1123 if (async
< acc_async_noval
)
1128 void * openacc_get_current_cuda_context (void);
1131 PTX_alloc (size_t s
)
1136 r
= cuMemAlloc (&d
, s
);
1137 if (r
== CUDA_ERROR_OUT_OF_MEMORY
)
1139 if (r
!= CUDA_SUCCESS
)
1140 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r
));
1151 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)p
);
1152 if (r
!= CUDA_SUCCESS
)
1153 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r
));
1155 if ((CUdeviceptr
)p
!= pb
)
1156 GOMP_PLUGIN_fatal ("invalid device address");
1158 r
= cuMemFree ((CUdeviceptr
)p
);
1159 if (r
!= CUDA_SUCCESS
)
1160 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r
));
1164 PTX_host2dev (void *d
, const void *h
, size_t s
)
1174 GOMP_PLUGIN_fatal ("invalid device address");
1176 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1177 if (r
!= CUDA_SUCCESS
)
1178 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r
));
1181 GOMP_PLUGIN_fatal ("invalid device address");
1184 GOMP_PLUGIN_fatal ("invalid host address");
1187 GOMP_PLUGIN_fatal ("invalid host or device address");
1189 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1190 GOMP_PLUGIN_fatal ("invalid size");
1192 #ifndef DISABLE_ASYNC
1193 if (current_stream
!= PTX_dev
->null_stream
)
1197 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1199 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1200 if (r
!= CUDA_SUCCESS
)
1201 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r
));
1205 r
= cuMemcpyHtoDAsync ((CUdeviceptr
)d
, h
, s
, current_stream
->stream
);
1206 if (r
!= CUDA_SUCCESS
)
1207 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r
));
1209 r
= cuEventRecord (*e
, current_stream
->stream
);
1210 if (r
!= CUDA_SUCCESS
)
1211 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1213 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1218 r
= cuMemcpyHtoD ((CUdeviceptr
)d
, h
, s
);
1219 if (r
!= CUDA_SUCCESS
)
1220 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r
));
1227 PTX_dev2host (void *h
, const void *d
, size_t s
)
1237 GOMP_PLUGIN_fatal ("invalid device address");
1239 r
= cuMemGetAddressRange (&pb
, &ps
, (CUdeviceptr
)d
);
1240 if (r
!= CUDA_SUCCESS
)
1241 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r
));
1244 GOMP_PLUGIN_fatal ("invalid device address");
1247 GOMP_PLUGIN_fatal ("invalid host address");
1250 GOMP_PLUGIN_fatal ("invalid host or device address");
1252 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1253 GOMP_PLUGIN_fatal ("invalid size");
1255 #ifndef DISABLE_ASYNC
1256 if (current_stream
!= PTX_dev
->null_stream
)
1260 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1262 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1263 if (r
!= CUDA_SUCCESS
)
1264 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r
));
1268 r
= cuMemcpyDtoHAsync (h
, (CUdeviceptr
)d
, s
, current_stream
->stream
);
1269 if (r
!= CUDA_SUCCESS
)
1270 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r
));
1272 r
= cuEventRecord (*e
, current_stream
->stream
);
1273 if (r
!= CUDA_SUCCESS
)
1274 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1276 event_add (PTX_EVT_MEM
, e
, (void *)h
);
1281 r
= cuMemcpyDtoH (h
, (CUdeviceptr
)d
, s
);
1282 if (r
!= CUDA_SUCCESS
)
1283 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r
));
1290 PTX_set_async (int async
)
1292 current_stream
= select_stream_for_async (async
, pthread_self (), true, NULL
);
1296 PTX_async_test (int async
)
1299 struct PTX_stream
*s
;
1301 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1304 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1306 r
= cuStreamQuery (s
->stream
);
1307 if (r
== CUDA_SUCCESS
)
1309 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1310 whether all work has completed on this stream, and if so omits the call
1311 to the wait hook. If that happens, event_gc might not get called
1312 (which prevents variables from getting unmapped and their associated
1313 device storage freed), so call it here. */
1317 else if (r
== CUDA_ERROR_NOT_READY
)
1320 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r
));
1326 PTX_async_test_all (void)
1328 struct PTX_stream
*s
;
1329 pthread_t self
= pthread_self ();
1331 GOMP_PLUGIN_mutex_lock (&PTX_dev
->stream_lock
);
1333 for (s
= PTX_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1335 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1336 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1338 GOMP_PLUGIN_mutex_unlock (&PTX_dev
->stream_lock
);
1343 GOMP_PLUGIN_mutex_unlock (&PTX_dev
->stream_lock
);
1351 PTX_wait (int async
)
1354 struct PTX_stream
*s
;
1356 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1359 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1361 r
= cuStreamSynchronize (s
->stream
);
1362 if (r
!= CUDA_SUCCESS
)
1363 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r
));
1369 PTX_wait_async (int async1
, int async2
)
1373 struct PTX_stream
*s1
, *s2
;
1374 pthread_t self
= pthread_self ();
1376 /* The stream that is waiting (rather than being waited for) doesn't
1377 necessarily have to exist already. */
1378 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1380 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1382 GOMP_PLUGIN_fatal ("invalid async 1\n");
1385 GOMP_PLUGIN_fatal ("identical parameters");
1387 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1389 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1390 if (r
!= CUDA_SUCCESS
)
1391 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r
));
1395 r
= cuEventRecord (*e
, s1
->stream
);
1396 if (r
!= CUDA_SUCCESS
)
1397 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1399 event_add (PTX_EVT_SYNC
, e
, NULL
);
1401 r
= cuStreamWaitEvent (s2
->stream
, *e
, 0);
1402 if (r
!= CUDA_SUCCESS
)
1403 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r
));
1410 struct PTX_stream
*s
;
1411 pthread_t self
= pthread_self ();
1413 GOMP_PLUGIN_mutex_lock (&PTX_dev
->stream_lock
);
1415 /* Wait for active streams initiated by this thread (or by multiple threads)
1417 for (s
= PTX_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1419 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1421 r
= cuStreamQuery (s
->stream
);
1422 if (r
== CUDA_SUCCESS
)
1424 else if (r
!= CUDA_ERROR_NOT_READY
)
1425 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r
));
1427 r
= cuStreamSynchronize (s
->stream
);
1428 if (r
!= CUDA_SUCCESS
)
1429 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r
));
1433 GOMP_PLUGIN_mutex_unlock (&PTX_dev
->stream_lock
);
1439 PTX_wait_all_async (int async
)
1442 struct PTX_stream
*waiting_stream
, *other_stream
;
1444 pthread_t self
= pthread_self ();
1446 /* The stream doing the waiting. This could be the first mention of the
1447 stream, so create it if necessary. */
1449 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1451 /* Launches on the null stream already block on other streams in the
1453 if (!waiting_stream
|| waiting_stream
== PTX_dev
->null_stream
)
1458 GOMP_PLUGIN_mutex_lock (&PTX_dev
->stream_lock
);
1460 for (other_stream
= PTX_dev
->active_streams
;
1461 other_stream
!= NULL
;
1462 other_stream
= other_stream
->next
)
1464 if (!other_stream
->multithreaded
1465 && !pthread_equal (other_stream
->host_thread
, self
))
1468 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1470 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1471 if (r
!= CUDA_SUCCESS
)
1472 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r
));
1474 /* Record an event on the waited-for stream. */
1475 r
= cuEventRecord (*e
, other_stream
->stream
);
1476 if (r
!= CUDA_SUCCESS
)
1477 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1479 event_add (PTX_EVT_SYNC
, e
, NULL
);
1481 r
= cuStreamWaitEvent (waiting_stream
->stream
, *e
, 0);
1482 if (r
!= CUDA_SUCCESS
)
1483 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r
));
1486 GOMP_PLUGIN_mutex_unlock (&PTX_dev
->stream_lock
);
1490 PTX_get_current_cuda_device (void)
1495 return &PTX_dev
->dev
;
1499 PTX_get_current_cuda_context (void)
1504 return PTX_dev
->ctx
;
1508 PTX_get_cuda_stream (int async
)
1510 struct PTX_stream
*s
;
1515 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1517 return s
? s
->stream
: NULL
;
1521 PTX_set_cuda_stream (int async
, void *stream
)
1523 struct PTX_stream
*oldstream
;
1524 pthread_t self
= pthread_self ();
1526 GOMP_PLUGIN_mutex_lock (&PTX_dev
->stream_lock
);
1529 GOMP_PLUGIN_fatal ("bad async %d", async
);
1531 /* We have a list of active streams and an array mapping async values to
1532 entries of that list. We need to take "ownership" of the passed-in stream,
1533 and add it to our list, removing the previous entry also (if there was one)
1534 in order to prevent resource leaks. Note the potential for surprise
1535 here: maybe we should keep track of passed-in streams and leave it up to
1536 the user to tidy those up, but that doesn't work for stream handles
1537 returned from acc_get_cuda_stream above... */
1539 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1543 if (PTX_dev
->active_streams
== oldstream
)
1544 PTX_dev
->active_streams
= PTX_dev
->active_streams
->next
;
1547 struct PTX_stream
*s
= PTX_dev
->active_streams
;
1548 while (s
->next
!= oldstream
)
1550 s
->next
= s
->next
->next
;
1553 cuStreamDestroy (oldstream
->stream
);
1554 map_fini (oldstream
);
1558 GOMP_PLUGIN_mutex_unlock (&PTX_dev
->stream_lock
);
1560 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1565 /* Plugin entry points. */
/* NOTE(review): plugin query entry points.  The function signatures (and
   presumably the `#ifdef DEBUG' guards around the trace fprintf, as used by
   the other entry points below) were lost in extraction.  By the libgomp
   plugin API of this era these are the bodies of `get_type' (returns the
   offload target type) and `get_caps' (returns the capability mask) —
   recover the exact signatures from the unmangled file before use.  */
fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
return TARGET_TYPE_NVIDIA_PTX;

return TARGET_CAP_OPENACC_200;
/* Plugin entry point: report the number of available PTX devices.
   NOTE(review): the DEBUG guard is reconstructed from line-number gaps, as
   for the other entry points.  */
int
get_num_devices (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_num_devices ();
}
/* Offload image data recorded by offload_register and consumed by
   device_get_table: the target-side (PTX) image descriptor and the
   host-side address table.  */
static void **kernel_target_data;
static void **kernel_host_table;
1604 offload_register (void *host_table
, void *target_data
)
1607 fprintf (stderr
, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__
, __FUNCTION__
,
1608 host_table
, target_data
);
1611 kernel_target_data
= target_data
;
1612 kernel_host_table
= host_table
;
/* NOTE(review): the following two fragments are the debug-trace bodies of
   two plugin entry points whose signatures were lost in extraction
   (original lines ~1615-1631, between offload_register and
   device_get_table) — presumably device init/fini style hooks.  Recover
   the signatures and any remaining statements from the unmangled file.  */
fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);

fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
1636 device_get_table (struct mapping_table
**tablep
)
1643 struct targ_fn_descriptor
*targ_fns
;
1646 fprintf (stderr
, "libgomp plugin: %s:%s (%p)\n", __FILE__
, __FUNCTION__
,
1650 if (PTX_init () <= 0)
1653 /* This isn't an error, because an image may legitimately have no offloaded
1654 regions and so will not call GOMP_offload_register. */
1655 if (kernel_target_data
== NULL
)
1658 link_ptx (&module
, kernel_target_data
[0]);
1660 /* kernel_target_data[0] -> ptx code
1661 kernel_target_data[1] -> variable mappings
1662 kernel_target_data[2] -> array of kernel names in ascii
1664 kernel_host_table[0] -> start of function addresses (_omp_func_table)
1665 kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
1667 The array of kernel names and the functions addresses form a
1668 one-to-one correspondence. */
1670 fn_table
= kernel_host_table
[0];
1671 fn_names
= (char **) kernel_target_data
[2];
1672 fn_entries
= (kernel_host_table
[1] - kernel_host_table
[0]) / sizeof (void *);
1674 *tablep
= GOMP_PLUGIN_malloc (sizeof (struct mapping_table
) * fn_entries
);
1675 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1678 for (i
= 0; i
< fn_entries
; i
++)
1680 CUfunction function
;
1682 r
= cuModuleGetFunction (&function
, module
, fn_names
[i
]);
1683 if (r
!= CUDA_SUCCESS
)
1684 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r
));
1686 targ_fns
[i
].fn
= function
;
1687 targ_fns
[i
].name
= (const char *) fn_names
[i
];
1689 (*tablep
)[i
].host_start
= (uintptr_t) fn_table
[i
];
1690 (*tablep
)[i
].host_end
= (*tablep
)[i
].host_start
+ 1;
1691 (*tablep
)[i
].tgt_start
= (uintptr_t) &targ_fns
[i
];
1692 (*tablep
)[i
].tgt_end
= (*tablep
)[i
].tgt_start
+ 1;
/* Plugin entry point: allocate SIZE bytes of device memory.  */
void *
device_alloc (size_t size)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
	   size);
#endif

  return PTX_alloc (size);
}
/* Plugin entry point: free device memory previously obtained from
   device_alloc.  NOTE(review): the PTX_free call was reconstructed from a
   dropped line — confirm.  */
void
device_free (void *ptr)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
#endif

  PTX_free (ptr);
}
/* Plugin entry point: copy N bytes from device address SRC to host address
   DST.  */
int
device_dev2host (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_dev2host (dst, src, n);
}
/* Plugin entry point: copy N bytes from host address SRC to device address
   DST.  */
int
device_host2dev (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_host2dev (dst, src, n);
}
/* Hook for launching an offloaded region: FN_PTR is the target function
   descriptor, VARS its argument block.  NULL here means the hook is unset;
   presumably assigned elsewhere — TODO confirm against the full file.  */
void (*device_run) (void *fn_ptr, void *vars) = NULL;
/* Plugin entry point: launch an OpenACC parallel region.  All arguments are
   forwarded unchanged to PTX_exec, which performs the actual kernel
   launch.  */
void
openacc_parallel (void (*fn) (void *), size_t mapnum, void **hostaddrs,
		  void **devaddrs, size_t *sizes, unsigned short *kinds,
		  int num_gangs, int num_workers, int vector_length,
		  int async, void *targ_mem_desc)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %d, %d, %d, "
	   "%d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs, sizes,
	   kinds, num_gangs, num_workers, vector_length, async, targ_mem_desc);
#endif

  PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	    num_workers, vector_length, async, targ_mem_desc);
}
/* Plugin entry point: open PTX device number N for the current thread.
   NOTE(review): return type reconstructed (the original return-type line
   was dropped); it forwards PTX_open_device's result — confirm.  */
void *
openacc_open_device (int n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
#endif

  return PTX_open_device (n);
}
/* Plugin entry point: close the device previously opened as handle H.
   NOTE(review): return type reconstructed (the original return-type line
   was dropped); it forwards PTX_close_device's result — confirm.  */
int
openacc_close_device (void *h)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
#endif

  return PTX_close_device (h);
}
1779 openacc_set_device_num (int n
)
1783 if (!PTX_dev
|| PTX_dev
->ord
!= n
)
1784 (void) PTX_open_device (n
);
1787 /* This can be called before the device is "opened" for the current thread, in
1788 which case we can't tell which device number should be returned. We don't
1789 actually want to open the device here, so just return -1 and let the caller
1790 (oacc-init.c:acc_get_device_num) handle it. */
1793 openacc_get_device_num (void)
1796 return PTX_dev
->ord
;
/* Plugin entry point: report whether the PTX target is usable.  */
int
openacc_avail (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_avail ();
}
1811 openacc_register_async_cleanup (void *targ_mem_desc
)
1817 fprintf (stderr
, "libgomp plugin: %s:%s (%p)\n", __FILE__
, __FUNCTION__
,
1821 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1823 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
1824 if (r
!= CUDA_SUCCESS
)
1825 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r
));
1827 r
= cuEventRecord (*e
, current_stream
->stream
);
1828 if (r
!= CUDA_SUCCESS
)
1829 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r
));
1831 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
);
/* Plugin entry point: nonblocking test whether the work queued under ASYNC
   has completed.  */
int
openacc_async_test (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_async_test (async);
}
/* Plugin entry point: nonblocking test whether all outstanding async work
   has completed.  */
int
openacc_async_test_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_async_test_all ();
}
/* Plugin entry point: block until the work queued under ASYNC completes.
   NOTE(review): the PTX_wait call was reconstructed from a dropped line —
   confirm.  */
void
openacc_async_wait (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait (async);
}
/* Plugin entry point: make ASYNC2's stream wait on ASYNC1's stream without
   blocking the host.  */
void
openacc_async_wait_async (int async1, int async2)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
	   async1, async2);
#endif

  PTX_wait_async (async1, async2);
}
/* Plugin entry point: block until all outstanding async work completes.
   NOTE(review): the PTX_wait_all call was reconstructed from a dropped
   line — confirm.  */
void
openacc_async_wait_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  PTX_wait_all ();
}
/* Plugin entry point: make ASYNC's stream wait on all other streams without
   blocking the host.  */
void
openacc_async_wait_all_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait_all_async (async);
}
/* Plugin entry point: set the default async value used for subsequent
   operations on this thread.  */
void
openacc_async_set_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_set_async (async);
}
/* Plugin entry point: return a pointer to the current CUdevice (or NULL).  */
void *
openacc_get_current_cuda_device (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_device ();
}
/* Plugin entry point: return the current CUcontext (or NULL).  */
void *
openacc_get_current_cuda_context (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a PTX_stream pointer.  */
void *
openacc_get_cuda_stream (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a PTX_stream pointer.  */
int
openacc_set_cuda_stream (int async, void *stream)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
	   async, stream);
#endif

  return PTX_set_cuda_stream (async, stream);
}