1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if PLUGIN_NVPTX_DYNAMIC
56 CUDA_ONE_CALL (cuCtxCreate) \
57 CUDA_ONE_CALL (cuCtxDestroy) \
58 CUDA_ONE_CALL (cuCtxGetCurrent) \
59 CUDA_ONE_CALL (cuCtxGetDevice) \
60 CUDA_ONE_CALL (cuCtxPopCurrent) \
61 CUDA_ONE_CALL (cuCtxPushCurrent) \
62 CUDA_ONE_CALL (cuCtxSynchronize) \
63 CUDA_ONE_CALL (cuDeviceGet) \
64 CUDA_ONE_CALL (cuDeviceGetAttribute) \
65 CUDA_ONE_CALL (cuDeviceGetCount) \
66 CUDA_ONE_CALL (cuEventCreate) \
67 CUDA_ONE_CALL (cuEventDestroy) \
68 CUDA_ONE_CALL (cuEventElapsedTime) \
69 CUDA_ONE_CALL (cuEventQuery) \
70 CUDA_ONE_CALL (cuEventRecord) \
71 CUDA_ONE_CALL (cuEventSynchronize) \
72 CUDA_ONE_CALL (cuFuncGetAttribute) \
73 CUDA_ONE_CALL (cuGetErrorString) \
74 CUDA_ONE_CALL (cuInit) \
75 CUDA_ONE_CALL (cuLaunchKernel) \
76 CUDA_ONE_CALL (cuLinkAddData) \
77 CUDA_ONE_CALL (cuLinkComplete) \
78 CUDA_ONE_CALL (cuLinkCreate) \
79 CUDA_ONE_CALL (cuLinkDestroy) \
80 CUDA_ONE_CALL (cuMemAlloc) \
81 CUDA_ONE_CALL (cuMemAllocHost) \
82 CUDA_ONE_CALL (cuMemcpy) \
83 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
84 CUDA_ONE_CALL (cuMemcpyDtoH) \
85 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
86 CUDA_ONE_CALL (cuMemcpyHtoD) \
87 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
88 CUDA_ONE_CALL (cuMemFree) \
89 CUDA_ONE_CALL (cuMemFreeHost) \
90 CUDA_ONE_CALL (cuMemGetAddressRange) \
91 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
92 CUDA_ONE_CALL (cuModuleGetFunction) \
93 CUDA_ONE_CALL (cuModuleGetGlobal) \
94 CUDA_ONE_CALL (cuModuleLoad) \
95 CUDA_ONE_CALL (cuModuleLoadData) \
96 CUDA_ONE_CALL (cuModuleUnload) \
97 CUDA_ONE_CALL (cuStreamCreate) \
98 CUDA_ONE_CALL (cuStreamDestroy) \
99 CUDA_ONE_CALL (cuStreamQuery) \
100 CUDA_ONE_CALL (cuStreamSynchronize) \
101 CUDA_ONE_CALL (cuStreamWaitEvent)
102 # define CUDA_ONE_CALL(call) \
103 __typeof (call) *call;
108 /* -1 if init_cuda_lib has not been called yet, false
109 if it has been and failed, true if it has been and succeeded. */
110 static signed char cuda_lib_inited
= -1;
112 /* Dynamically load the CUDA runtime library and initialize function
113 pointers, return false if unsuccessful, true if successful. */
117 if (cuda_lib_inited
!= -1)
118 return cuda_lib_inited
;
119 const char *cuda_runtime_lib
= "libcuda.so.1";
120 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
121 cuda_lib_inited
= false;
124 # undef CUDA_ONE_CALL
125 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
126 # define CUDA_ONE_CALL_1(call) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (cuda_lib.call == NULL) \
131 cuda_lib_inited
= true;
134 # undef CUDA_ONE_CALL
135 # undef CUDA_ONE_CALL_1
136 # define CUDA_CALL_PREFIX cuda_lib.
138 # define CUDA_CALL_PREFIX
139 # define init_cuda_lib() true
142 #include "secure_getenv.h"
144 /* Convenience macros for the frequently used CUDA library call and
145 error handling sequence as well as CUDA library calls that
146 do the error checking themselves or don't do it at all. */
148 #define CUDA_CALL_ERET(ERET, FN, ...) \
151 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
152 if (__r != CUDA_SUCCESS) \
154 GOMP_PLUGIN_error (#FN " error: %s", \
160 #define CUDA_CALL(FN, ...) \
161 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
163 #define CUDA_CALL_ASSERT(FN, ...) \
166 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
167 if (__r != CUDA_SUCCESS) \
169 GOMP_PLUGIN_fatal (#FN " error: %s", \
174 #define CUDA_CALL_NOCHECK(FN, ...) \
175 CUDA_CALL_PREFIX FN (__VA_ARGS__)
178 cuda_error (CUresult r
)
180 #if CUDA_VERSION < 7000
181 /* Specified in documentation and present in library from at least
182 5.5. Not declared in header file prior to 7.0. */
183 extern CUresult
cuGetErrorString (CUresult
, const char **);
187 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
188 if (r
!= CUDA_SUCCESS
)
189 desc
= "unknown cuda error";
194 static unsigned int instantiated_devices
= 0;
195 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
200 pthread_t host_thread
;
211 struct ptx_stream
*next
;
214 /* Thread-specific data for PTX. */
218 struct ptx_stream
*current_stream
;
219 struct ptx_device
*ptx_dev
;
230 map_init (struct ptx_stream
*s
)
232 int size
= getpagesize ();
238 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
239 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
244 s
->h_end
= s
->h_begin
+ size
;
245 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
253 map_fini (struct ptx_stream
*s
)
255 CUDA_CALL (cuMemFreeHost
, s
->h
);
260 map_pop (struct ptx_stream
*s
)
271 s
->h_tail
+= m
->size
;
273 if (s
->h_tail
>= s
->h_end
)
274 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
276 if (s
->h_next
== s
->h_tail
)
277 s
->h_prev
= s
->h_next
;
279 assert (s
->h_next
>= s
->h_begin
);
280 assert (s
->h_tail
>= s
->h_begin
);
281 assert (s
->h_prev
>= s
->h_begin
);
283 assert (s
->h_next
<= s
->h_end
);
284 assert (s
->h_tail
<= s
->h_end
);
285 assert (s
->h_prev
<= s
->h_end
);
289 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
297 left
= s
->h_end
- s
->h_next
;
298 size
+= sizeof (struct map
);
307 s
->h_next
= s
->h_begin
;
309 if (s
->h_next
+ size
> s
->h_end
)
310 GOMP_PLUGIN_fatal ("unable to push map");
319 offset
= (void *)&m
->mappings
[0] - s
->h
;
321 *d
= (void *)(s
->d
+ offset
);
322 *h
= (void *)(s
->h
+ offset
);
324 s
->h_prev
= s
->h_next
;
330 assert (s
->h_next
>= s
->h_begin
);
331 assert (s
->h_tail
>= s
->h_begin
);
332 assert (s
->h_prev
>= s
->h_begin
);
333 assert (s
->h_next
<= s
->h_end
);
334 assert (s
->h_tail
<= s
->h_end
);
335 assert (s
->h_prev
<= s
->h_end
);
340 /* Target data function launch information. */
342 struct targ_fn_launch
345 unsigned short dim
[GOMP_DIM_MAX
];
348 /* Target PTX object information. */
356 /* Target data image information. */
358 typedef struct nvptx_tdata
360 const struct targ_ptx_obj
*ptx_objs
;
363 const char *const *var_names
;
366 const struct targ_fn_launch
*fn_descs
;
370 /* Descriptor of a loaded function. */
372 struct targ_fn_descriptor
375 const struct targ_fn_launch
*launch
;
377 int max_threads_per_block
;
380 /* A loaded PTX image. */
381 struct ptx_image_data
383 const void *target_data
;
386 struct targ_fn_descriptor
*fns
; /* Array of functions. */
388 struct ptx_image_data
*next
;
396 struct ptx_stream
*null_stream
;
397 /* All non-null streams associated with this device (actually context),
398 either created implicitly or passed in from the user (via
399 acc_set_cuda_stream). */
400 struct ptx_stream
*active_streams
;
402 struct ptx_stream
**arr
;
405 /* A lock for use when manipulating the above stream list and array. */
406 pthread_mutex_t stream_lock
;
418 struct ptx_image_data
*images
; /* Images loaded on device. */
419 pthread_mutex_t image_lock
; /* Lock for above list. */
421 struct ptx_device
*next
;
429 PTX_EVT_ASYNC_CLEANUP
440 struct ptx_event
*next
;
443 static pthread_mutex_t ptx_event_lock
;
444 static struct ptx_event
*ptx_events
;
446 static struct ptx_device
**ptx_devices
;
448 static inline struct nvptx_thread
*
451 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
455 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
458 struct ptx_stream
*null_stream
459 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
461 null_stream
->stream
= NULL
;
462 null_stream
->host_thread
= pthread_self ();
463 null_stream
->multithreaded
= true;
464 null_stream
->d
= (CUdeviceptr
) NULL
;
465 null_stream
->h
= NULL
;
466 if (!map_init (null_stream
))
469 ptx_dev
->null_stream
= null_stream
;
470 ptx_dev
->active_streams
= NULL
;
471 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
476 /* This is just a guess -- make space for as many async streams as the
477 current device is capable of concurrently executing. This can grow
478 later as necessary. No streams are created yet. */
479 ptx_dev
->async_streams
.arr
480 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
481 ptx_dev
->async_streams
.size
= concurrency
;
483 for (i
= 0; i
< concurrency
; i
++)
484 ptx_dev
->async_streams
.arr
[i
] = NULL
;
490 fini_streams_for_device (struct ptx_device
*ptx_dev
)
492 free (ptx_dev
->async_streams
.arr
);
495 while (ptx_dev
->active_streams
!= NULL
)
497 struct ptx_stream
*s
= ptx_dev
->active_streams
;
498 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
502 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
503 if (r
!= CUDA_SUCCESS
)
505 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
511 ret
&= map_fini (ptx_dev
->null_stream
);
512 free (ptx_dev
->null_stream
);
516 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
517 thread THREAD (and also current device/context). If CREATE is true, create
518 the stream if it does not exist (or use EXISTING if it is non-NULL), and
519 associate the stream with the same thread argument. Returns stream to use
522 static struct ptx_stream
*
523 select_stream_for_async (int async
, pthread_t thread
, bool create
,
526 struct nvptx_thread
*nvthd
= nvptx_thread ();
527 /* Local copy of TLS variable. */
528 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
529 struct ptx_stream
*stream
= NULL
;
530 int orig_async
= async
;
532 /* The special value acc_async_noval (-1) maps (for now) to an
533 implicitly-created stream, which is then handled the same as any other
534 numbered async stream. Other options are available, e.g. using the null
535 stream for anonymous async operations, or choosing an idle stream from an
536 active set. But, stick with this for now. */
537 if (async
> acc_async_sync
)
541 pthread_mutex_lock (&ptx_dev
->stream_lock
);
543 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
544 null stream, and in fact better performance may be obtainable if it doesn't
545 (because the null stream enforces overly-strict synchronisation with
546 respect to other streams for legacy reasons, and that's probably not
547 needed with OpenACC). Maybe investigate later. */
548 if (async
== acc_async_sync
)
549 stream
= ptx_dev
->null_stream
;
550 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
551 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
552 stream
= ptx_dev
->async_streams
.arr
[async
];
553 else if (async
>= 0 && create
)
555 if (async
>= ptx_dev
->async_streams
.size
)
557 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
559 if (async
>= newsize
)
562 ptx_dev
->async_streams
.arr
563 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
564 newsize
* sizeof (struct ptx_stream
*));
566 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
567 ptx_dev
->async_streams
.arr
[i
] = NULL
;
569 ptx_dev
->async_streams
.size
= newsize
;
572 /* Create a new stream on-demand if there isn't one already, or if we're
573 setting a particular async value to an existing (externally-provided)
575 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
579 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
582 s
->stream
= existing
;
585 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
587 if (r
!= CUDA_SUCCESS
)
589 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
590 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
595 /* If CREATE is true, we're going to be queueing some work on this
596 stream. Associate it with the current host thread. */
597 s
->host_thread
= thread
;
598 s
->multithreaded
= false;
600 s
->d
= (CUdeviceptr
) NULL
;
604 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
605 GOMP_PLUGIN_fatal ("map_init fail");
608 s
->next
= ptx_dev
->active_streams
;
609 ptx_dev
->active_streams
= s
;
610 ptx_dev
->async_streams
.arr
[async
] = s
;
613 stream
= ptx_dev
->async_streams
.arr
[async
];
618 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
619 GOMP_PLUGIN_fatal ("bad async %d", async
);
624 assert (stream
!= NULL
);
626 /* If we're trying to use the same stream from different threads
627 simultaneously, set stream->multithreaded to true. This affects the
628 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
629 only wait for asynchronous launches from the same host thread they are
630 invoked on. If multiple threads use the same async value, we make note
631 of that here and fall back to testing/waiting for all threads in those
633 if (thread
!= stream
->host_thread
)
634 stream
->multithreaded
= true;
636 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
638 else if (stream
&& !stream
->multithreaded
639 && !pthread_equal (stream
->host_thread
, thread
))
640 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
645 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
646 should be locked on entry and remains locked on exit. */
653 if (instantiated_devices
!= 0)
657 pthread_mutex_init (&ptx_event_lock
, NULL
);
659 if (!init_cuda_lib ())
662 CUDA_CALL (cuInit
, 0);
664 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
665 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
670 /* Select the N'th PTX device for the current host thread. The device must
671 have been previously opened before calling this function. */
674 nvptx_attach_host_thread_to_device (int n
)
678 struct ptx_device
*ptx_dev
;
681 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
682 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
684 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
688 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
694 ptx_dev
= ptx_devices
[n
];
697 GOMP_PLUGIN_error ("device %d not found", n
);
701 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
703 /* We don't necessarily have a current context (e.g. if it has been
704 destroyed). Pop it if we do though. */
706 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
708 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
713 static struct ptx_device
*
714 nvptx_open_device (int n
)
716 struct ptx_device
*ptx_dev
;
717 CUdevice dev
, ctx_dev
;
719 int async_engines
, pi
;
721 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
723 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
727 ptx_dev
->ctx_shared
= false;
729 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
730 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
732 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
736 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
738 /* The current host thread has an active context for a different device.
741 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
744 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
747 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
749 ptx_dev
->ctx_shared
= true;
751 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
752 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
753 ptx_dev
->overlap
= pi
;
755 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
756 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
759 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
760 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
761 ptx_dev
->concur
= pi
;
763 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
764 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
767 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
768 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
771 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
772 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
773 ptx_dev
->clock_khz
= pi
;
775 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
776 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
777 ptx_dev
->num_sms
= pi
;
779 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
780 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
781 ptx_dev
->regs_per_block
= pi
;
783 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
784 in CUDA 6.0 and newer. */
785 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
, 82, dev
);
786 /* Fallback: use limit of registers per block, which is usually equal. */
787 if (r
== CUDA_ERROR_INVALID_VALUE
)
788 pi
= ptx_dev
->regs_per_block
;
789 else if (r
!= CUDA_SUCCESS
)
791 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
794 ptx_dev
->regs_per_sm
= pi
;
796 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
797 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
800 GOMP_PLUGIN_error ("Only warp size 32 is supported");
804 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
805 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
806 if (r
!= CUDA_SUCCESS
)
809 ptx_dev
->images
= NULL
;
810 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
812 if (!init_streams_for_device (ptx_dev
, async_engines
))
819 nvptx_close_device (struct ptx_device
*ptx_dev
)
824 if (!fini_streams_for_device (ptx_dev
))
827 pthread_mutex_destroy (&ptx_dev
->image_lock
);
829 if (!ptx_dev
->ctx_shared
)
830 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
837 nvptx_get_num_devices (void)
841 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
843 if (sizeof (void *) != 8)
845 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
846 " only 64-bit configurations are supported\n");
850 /* This function will be called before the plugin has been initialized in
851 order to enumerate available devices, but CUDA API routines can't be used
852 until cuInit has been called. Just call it now (but don't yet do any
853 further initialization). */
854 if (instantiated_devices
== 0)
856 if (!init_cuda_lib ())
858 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
859 /* This is not an error: e.g. we may have CUDA libraries installed but
860 no devices available. */
861 if (r
!= CUDA_SUCCESS
)
863 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
869 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
874 notify_var (const char *var_name
, const char *env_var
)
877 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
879 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
883 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
885 const char *var_name
= "GOMP_NVPTX_JIT";
886 const char *env_var
= secure_getenv (var_name
);
887 notify_var (var_name
, env_var
);
892 const char *c
= env_var
;
898 if (c
[0] == '-' && c
[1] == 'O'
899 && '0' <= c
[2] && c
[2] <= '4'
900 && (c
[3] == '\0' || c
[3] == ' '))
902 *gomp_nvptx_o
= c
[2] - '0';
907 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
913 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
916 CUjit_option opts
[7];
921 CUlinkState linkstate
;
924 size_t linkoutsize
__attribute__ ((unused
));
926 opts
[0] = CU_JIT_WALL_TIME
;
927 optvals
[0] = &elapsed
;
929 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
930 optvals
[1] = &ilog
[0];
932 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
933 optvals
[2] = (void *) sizeof ilog
;
935 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
936 optvals
[3] = &elog
[0];
938 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
939 optvals
[4] = (void *) sizeof elog
;
941 opts
[5] = CU_JIT_LOG_VERBOSE
;
942 optvals
[5] = (void *) 1;
944 static intptr_t gomp_nvptx_o
= -1;
946 static bool init_done
= false;
949 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
954 if (gomp_nvptx_o
!= -1)
956 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
957 optvals
[nopts
] = (void *) gomp_nvptx_o
;
961 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
963 for (; num_objs
--; ptx_objs
++)
965 /* cuLinkAddData's 'data' argument erroneously omits the const
967 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
968 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
969 (char *) ptx_objs
->code
, ptx_objs
->size
,
971 if (r
!= CUDA_SUCCESS
)
973 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
974 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
980 GOMP_PLUGIN_debug (0, "Linking\n");
981 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
983 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
984 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
986 if (r
!= CUDA_SUCCESS
)
988 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
992 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
993 CUDA_CALL (cuLinkDestroy
, linkstate
);
998 event_gc (bool memmap_lockable
)
1000 struct ptx_event
*ptx_event
= ptx_events
;
1001 struct ptx_event
*async_cleanups
= NULL
;
1002 struct nvptx_thread
*nvthd
= nvptx_thread ();
1004 pthread_mutex_lock (&ptx_event_lock
);
1006 while (ptx_event
!= NULL
)
1009 struct ptx_event
*e
= ptx_event
;
1011 ptx_event
= ptx_event
->next
;
1013 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1016 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1017 if (r
== CUDA_SUCCESS
)
1019 bool append_async
= false;
1034 case PTX_EVT_ASYNC_CLEANUP
:
1036 /* The function gomp_plugin_async_unmap_vars needs to claim the
1037 memory-map splay tree lock for the current device, so we
1038 can't call it when one of our callers has already claimed
1039 the lock. In that case, just delay the GC for this event
1041 if (!memmap_lockable
)
1044 append_async
= true;
1049 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1052 /* Unlink 'e' from ptx_events list. */
1053 if (ptx_events
== e
)
1054 ptx_events
= ptx_events
->next
;
1057 struct ptx_event
*e_
= ptx_events
;
1058 while (e_
->next
!= e
)
1060 e_
->next
= e_
->next
->next
;
1065 e
->next
= async_cleanups
;
1073 pthread_mutex_unlock (&ptx_event_lock
);
1075 /* We have to do these here, after ptx_event_lock is released. */
1076 while (async_cleanups
)
1078 struct ptx_event
*e
= async_cleanups
;
1079 async_cleanups
= async_cleanups
->next
;
1081 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
1087 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1089 struct ptx_event
*ptx_event
;
1090 struct nvptx_thread
*nvthd
= nvptx_thread ();
1092 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1093 || type
== PTX_EVT_ASYNC_CLEANUP
);
1095 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1096 ptx_event
->type
= type
;
1098 ptx_event
->addr
= h
;
1099 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1100 ptx_event
->val
= val
;
1102 pthread_mutex_lock (&ptx_event_lock
);
1104 ptx_event
->next
= ptx_events
;
1105 ptx_events
= ptx_event
;
1107 pthread_mutex_unlock (&ptx_event_lock
);
1111 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1112 int async
, unsigned *dims
, void *targ_mem_desc
)
1114 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1115 CUfunction function
;
1118 struct ptx_stream
*dev_str
;
1121 struct nvptx_thread
*nvthd
= nvptx_thread ();
1122 const char *maybe_abort_msg
= "(perhaps abort was called)";
1124 function
= targ_fn
->fn
;
1126 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1127 assert (dev_str
== nvthd
->current_stream
);
1129 /* Initialize the launch dimensions. Typically this is constant,
1130 provided by the device compiler, but we must permit runtime
1133 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1135 if (targ_fn
->launch
->dim
[i
])
1136 dims
[i
] = targ_fn
->launch
->dim
[i
];
1143 /* See if the user provided GOMP_OPENACC_DIM environment
1144 variable to specify runtime defaults. */
1145 static int default_dims
[GOMP_DIM_MAX
];
1147 pthread_mutex_lock (&ptx_dev_lock
);
1148 if (!default_dims
[0])
1150 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1151 default_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1153 int warp_size
, block_size
, dev_size
, cpu_size
;
1154 CUdevice dev
= nvptx_thread()->ptx_dev
->dev
;
1155 /* 32 is the default for known hardware. */
1156 int gang
= 0, worker
= 32, vector
= 32;
1157 CUdevice_attribute cu_tpb
, cu_ws
, cu_mpc
, cu_tpm
;
1159 cu_tpb
= CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
;
1160 cu_ws
= CU_DEVICE_ATTRIBUTE_WARP_SIZE
;
1161 cu_mpc
= CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
;
1162 cu_tpm
= CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
;
1164 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &block_size
, cu_tpb
,
1165 dev
) == CUDA_SUCCESS
1166 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &warp_size
, cu_ws
,
1167 dev
) == CUDA_SUCCESS
1168 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &dev_size
, cu_mpc
,
1169 dev
) == CUDA_SUCCESS
1170 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &cpu_size
, cu_tpm
,
1171 dev
) == CUDA_SUCCESS
)
1173 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1174 " dev_size=%d, cpu_size=%d\n",
1175 warp_size
, block_size
, dev_size
, cpu_size
);
1176 gang
= (cpu_size
/ block_size
) * dev_size
;
1177 worker
= block_size
/ warp_size
;
1181 /* There is no upper bound on the gang size. The best size
1182 matches the hardware configuration. Logical gangs are
1183 scheduled onto physical hardware. To maximize usage, we
1184 should guess a large number. */
1185 if (default_dims
[GOMP_DIM_GANG
] < 1)
1186 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1187 /* The worker size must not exceed the hardware. */
1188 if (default_dims
[GOMP_DIM_WORKER
] < 1
1189 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1190 default_dims
[GOMP_DIM_WORKER
] = worker
;
1191 /* The vector size must exactly match the hardware. */
1192 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1193 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1194 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1196 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1197 default_dims
[GOMP_DIM_GANG
],
1198 default_dims
[GOMP_DIM_WORKER
],
1199 default_dims
[GOMP_DIM_VECTOR
]);
1201 pthread_mutex_unlock (&ptx_dev_lock
);
1203 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1205 dims
[i
] = default_dims
[i
];
1208 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1209 the host and the device. HP is a host pointer to the new chunk, and DP is
1210 the corresponding device pointer. */
1211 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
1213 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1215 /* Copy the array of arguments to the mapped page. */
1216 for (i
= 0; i
< mapnum
; i
++)
1217 ((void **) hp
)[i
] = devaddrs
[i
];
1219 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1220 fact have the same value on a unified-memory system). */
1221 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1222 mapnum
* sizeof (void *));
1223 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1224 " gangs=%u, workers=%u, vectors=%u\n",
1225 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1226 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1230 // num_gangs nctaid.x
1231 // num_workers ntid.y
1232 // vector length ntid.x
1235 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1236 dims
[GOMP_DIM_GANG
], 1, 1,
1237 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1238 0, dev_str
->stream
, kargs
, 0);
1240 #ifndef DISABLE_ASYNC
1241 if (async
< acc_async_noval
)
1243 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1244 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1245 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1247 else if (r
!= CUDA_SUCCESS
)
1248 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1254 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1256 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1257 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1258 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1260 else if (r
!= CUDA_SUCCESS
)
1261 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1265 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1267 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1270 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1271 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1272 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1274 else if (r
!= CUDA_SUCCESS
)
1275 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1278 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1279 targ_fn
->launch
->fn
);
1281 #ifndef DISABLE_ASYNC
1282 if (async
< acc_async_noval
)
1287 void * openacc_get_current_cuda_context (void);
1290 nvptx_alloc (size_t s
)
1294 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1299 nvptx_free (void *p
)
1304 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1305 if ((CUdeviceptr
) p
!= pb
)
1307 GOMP_PLUGIN_error ("invalid device address");
1311 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1317 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1321 struct nvptx_thread
*nvthd
= nvptx_thread ();
1327 GOMP_PLUGIN_error ("invalid device address");
1331 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1335 GOMP_PLUGIN_error ("invalid device address");
1340 GOMP_PLUGIN_error ("invalid host address");
1345 GOMP_PLUGIN_error ("invalid host or device address");
1348 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1350 GOMP_PLUGIN_error ("invalid size");
1354 #ifndef DISABLE_ASYNC
1355 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1357 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1358 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1360 CUDA_CALL (cuMemcpyHtoDAsync
,
1361 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1362 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1363 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1367 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1373 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1377 struct nvptx_thread
*nvthd
= nvptx_thread ();
1383 GOMP_PLUGIN_error ("invalid device address");
1387 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1391 GOMP_PLUGIN_error ("invalid device address");
1396 GOMP_PLUGIN_error ("invalid host address");
1401 GOMP_PLUGIN_error ("invalid host or device address");
1404 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1406 GOMP_PLUGIN_error ("invalid size");
1410 #ifndef DISABLE_ASYNC
1411 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1413 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1414 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1416 CUDA_CALL (cuMemcpyDtoHAsync
,
1417 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1418 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1419 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1423 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1429 nvptx_set_async (int async
)
1431 struct nvptx_thread
*nvthd
= nvptx_thread ();
1432 nvthd
->current_stream
1433 = select_stream_for_async (async
, pthread_self (), true, NULL
);
/* Return 1 if all work queued on stream ASYNC has completed, 0 if work
   is still pending.  Fatal error for an unknown async queue or an
   unexpected driver status.  */

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  /* Do not create the stream if it does not exist ('false').  */
  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
/* Return 1 if every stream belonging to this thread (or shared between
   threads) has drained, 0 if any still has pending work.  Takes the
   per-device stream-list lock while walking the active-stream list.  */

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      /* Only streams owned by this thread, or marked multithreaded,
	 are considered.  */
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && CUDA_CALL_NOCHECK (cuStreamQuery,
				s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  /* Reclaim completed events (and unmap their variables) now that all
     relevant streams are known to be idle.  */
  event_gc (true);

  return 1;
}
/* Block until all work queued on stream ASYNC has completed, then
   garbage-collect completed events.  Fatal error if ASYNC names no
   existing stream.  */

static void
nvptx_wait (int async)
{
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);
  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

  event_gc (true);
}
/* Make stream ASYNC2 wait (on the device, without blocking the host)
   for all work currently queued on stream ASYNC1, by recording an event
   on ASYNC1 and inserting a cuStreamWaitEvent on ASYNC2.  */

static void
nvptx_wait_async (int async1, int async2)
{
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  /* The waited-for stream must already exist.  */
  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

  event_gc (true);

  CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);

  /* Track the event so its storage is reclaimed once it fires.  */
  event_add (PTX_EVT_SYNC, e, NULL, 0);

  CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}
/* Block until every stream owned by this thread (or shared between
   threads) has drained, then garbage-collect completed events.  */

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  /* Query first: already-idle streams need no synchronize call,
	     and any status other than success/not-ready is fatal.  */
	  r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
/* Make stream ASYNC wait, on the device, for all other relevant active
   streams: for each such stream an event is recorded and ASYNC is told
   to wait on it.  Creates ASYNC if necessary; a null-stream waiter
   needs nothing because null-stream launches already serialize against
   other streams.  */

static void
nvptx_wait_all_async (int async)
{
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      /* Skip streams that belong to other host threads and are not
	 marked multithreaded.  */
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);

      /* Record an event on the waited-for stream.  */
      CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);

      event_add (PTX_EVT_SYNC, e, NULL, 0);

      CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
1615 nvptx_get_current_cuda_device (void)
1617 struct nvptx_thread
*nvthd
= nvptx_thread ();
1619 if (!nvthd
|| !nvthd
->ptx_dev
)
1622 return &nvthd
->ptx_dev
->dev
;
1626 nvptx_get_current_cuda_context (void)
1628 struct nvptx_thread
*nvthd
= nvptx_thread ();
1630 if (!nvthd
|| !nvthd
->ptx_dev
)
1633 return nvthd
->ptx_dev
->ctx
;
/* Return the raw CUstream backing queue ASYNC for this thread, or NULL
   when there is no thread state, no device, or no such stream.  Does
   not create the stream.  */

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
/* Install the user-supplied CUstream STREAM as the backing stream for
   queue ASYNC.  Any previously-associated stream is unlinked from the
   device's active-stream list, destroyed, and its host-side map storage
   freed, to avoid leaking resources.  Returns 1.  */

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      /* Unlink OLDSTREAM from the singly-linked active-stream list:
	 either it is the head, or we walk to its predecessor.  */
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);

      if (!map_fini (oldstream))
	GOMP_PLUGIN_fatal ("error when freeing host memory");

      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  /* Create the association for ASYNC using the caller's stream handle.  */
  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

/* Return the plugin's target name, used by libgomp to match offload
   images to this plugin.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
/* Return the capability mask: this plugin supports both OpenACC 2.0
   and OpenMP 4.0 offloading.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}
/* Return the offload target type identifier for NVIDIA PTX.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}
/* Return the number of available NVPTX devices.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
/* Initialize device number N, under the global device lock.  Fails if
   the CUDA driver cannot be initialized or device N is already open.
   Returns true on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}
/* Tear down device number N, under the global device lock.  A device
   that was never initialized is not an error.  Returns false if the
   thread cannot be attached to the device or closing it fails.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Initialize __nvptx_clocktick, if present in MODULE.  */

/* The device-side symbol (when the image defines it) is set to the
   duration of one device clock tick in milliseconds, derived from the
   device's clock rate in kHz.  A missing symbol is silently ignored;
   any other driver failure is fatal.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

/* Links the embedded PTX objects into a CUmodule, records the image on
   the device's image list, then fills *TARGET_TABLE with one addr_pair
   per kernel (pointing at a targ_fn_descriptor) followed by one per
   global variable (device address and size).  Returns the total number
   of table entries, or -1 on failure.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  /* Reject images produced for a newer plugin ABI than this one.  */
  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  /* Push the image onto the device's image list under its lock.  */
  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      /* Cache per-kernel resource limits for later launch-bounds
	 computation (see nvptx_adjust_launch_bounds).  */
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];
  bool ret = true;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  /* Find the image by its TARGET_DATA pointer, unlink it from the
     device's image list, and release its module and descriptors.  */
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
/* Allocate SIZE bytes of device memory on device ORD.  Returns NULL if
   the calling thread cannot be attached to the device.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}
/* Free device memory PTR on device ORD.  Returns true on success.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}
/* Copy N bytes from device address SRC on device ORD to host address
   DST.  Returns true on success.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}
/* Copy N bytes from host address SRC to device address DST on device
   ORD.  Returns true on success.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}
/* Copy N bytes between two device addresses on device ORD,
   asynchronously on the device's null stream.  Returns true unless
   CUDA_CALL reports a driver failure.  */

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  struct ptx_device *ptx_dev = ptx_devices[ord];
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
	     ptx_dev->null_stream->stream);
  return true;
}
/* Optional hook for an alternative kernel-execution routine; remains
   NULL by default.  NOTE(review): no user of this pointer is visible in
   this part of the file -- confirm where (if anywhere) it is set before
   relying on it.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* OpenACC entry point: launch kernel FN with MAPNUM mappings
   (HOSTADDRS/DEVADDRS), on queue ASYNC with launch geometry DIMS,
   delegating to nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
/* Arrange for TARG_MEM_DESC to be cleaned up once all work currently
   queued on the thread's current stream completes: record an event on
   the stream and register it as a PTX_EVT_ASYNC_CLEANUP event.  */

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}
/* OpenACC hook: test whether queue ASYNC has drained (1) or not (0).  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* OpenACC hook: test whether all of this thread's queues have drained.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* OpenACC hook: block until queue ASYNC has drained.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* OpenACC hook: make queue ASYNC2 wait on queue ASYNC1 (device-side).  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* OpenACC hook: block until all of this thread's queues have drained.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* OpenACC hook: make queue ASYNC wait on all other queues (device-side).  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* OpenACC hook: make ASYNC the thread's current queue.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
/* Allocate and initialize per-thread nvptx state for device ORD: the
   thread's current stream starts as the device's null stream, and the
   device's CUDA context is pushed if the thread has none current.
   Returns the new state, later freed by ..._destroy_thread_data.  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  /* Only push the device context when this thread has none current.  */
  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}
/* Release per-thread state allocated by ..._create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Interop hook: return a pointer to the current CUdevice (or NULL).  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Interop hook: return the current CUcontext (or NULL).  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  On entry *TEAMS_P/*THREADS_P may be <= 0 (meaning "choose");
   on exit both hold the values to launch with (*THREADS_P counts warps).  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}
/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
/* OpenMP entry point: run kernel TGT_FN with argument block TGT_VARS on
   device ORD.  ARGS is the libgomp target-argument list, scanned for
   num_teams / thread_limit requests; launch bounds are then clamped via
   nvptx_adjust_launch_bounds.  Per-warp soft stacks are allocated for
   the launch and freed after a blocking cuCtxSynchronize.  */

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  /* Scan the NULL-terminated argument list for device-generic
     NUM_TEAMS / THREAD_LIMIT values.  */
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  /* Block dimensions are (32, threads, 1): 32 lanes per warp, THREADS
     warps per team.  */
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, ptx_dev->null_stream->stream,
			 NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}
/* Asynchronous OpenMP target execution is not supported by this
   plugin; libgomp must not reach this entry point.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}