1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 typedef size_t (*CUoccupancyB2DSize
)(int);
65 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
66 const char *, unsigned, CUjit_option
*, void **);
67 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
68 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
69 CUoccupancyB2DSize
, size_t, int);
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
83 #include "cuda-lib.def"
85 # undef CUDA_ONE_CALL_MAYBE_NULL
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited
= -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers, return false if unsuccessful, true if successful. */
98 if (cuda_lib_inited
!= -1)
99 return cuda_lib_inited
;
100 const char *cuda_runtime_lib
= "libcuda.so.1";
101 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
102 cuda_lib_inited
= false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited
= true;
120 # define CUDA_CALL_PREFIX cuda_lib.
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
133 #include "secure_getenv.h"
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
177 cuda_error (CUresult r
)
179 const char *fallback
= "unknown cuda error";
182 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
185 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
186 if (r
== CUDA_SUCCESS
)
192 static unsigned int instantiated_devices
= 0;
193 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
198 pthread_t host_thread
;
209 struct ptx_stream
*next
;
212 /* Thread-specific data for PTX. */
216 struct ptx_stream
*current_stream
;
217 struct ptx_device
*ptx_dev
;
221 map_init (struct ptx_stream
*s
)
223 int size
= getpagesize ();
229 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
230 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
235 s
->h_end
= s
->h_begin
+ size
;
236 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
244 map_fini (struct ptx_stream
*s
)
246 CUDA_CALL (cuMemFreeHost
, s
->h
);
251 map_pop (struct ptx_stream
*s
)
258 s
->h_tail
= s
->h_next
;
260 if (s
->h_tail
>= s
->h_end
)
261 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
263 if (s
->h_next
== s
->h_tail
)
264 s
->h_prev
= s
->h_next
;
266 assert (s
->h_next
>= s
->h_begin
);
267 assert (s
->h_tail
>= s
->h_begin
);
268 assert (s
->h_prev
>= s
->h_begin
);
270 assert (s
->h_next
<= s
->h_end
);
271 assert (s
->h_tail
<= s
->h_end
);
272 assert (s
->h_prev
<= s
->h_end
);
276 map_push (struct ptx_stream
*s
, size_t size
, void **h
, void **d
)
283 left
= s
->h_end
- s
->h_next
;
290 assert (s
->h_next
== s
->h_prev
);
291 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
296 offset
= s
->h_next
- s
->h
;
298 *d
= (void *)(s
->d
+ offset
);
299 *h
= (void *)(s
->h
+ offset
);
301 s
->h_prev
= s
->h_next
;
307 assert (s
->h_next
>= s
->h_begin
);
308 assert (s
->h_tail
>= s
->h_begin
);
309 assert (s
->h_prev
>= s
->h_begin
);
310 assert (s
->h_next
<= s
->h_end
);
311 assert (s
->h_tail
<= s
->h_end
);
312 assert (s
->h_prev
<= s
->h_end
);
317 /* Target data function launch information. */
319 struct targ_fn_launch
322 unsigned short dim
[GOMP_DIM_MAX
];
325 /* Target PTX object information. */
333 /* Target data image information. */
335 typedef struct nvptx_tdata
337 const struct targ_ptx_obj
*ptx_objs
;
340 const char *const *var_names
;
343 const struct targ_fn_launch
*fn_descs
;
347 /* Descriptor of a loaded function. */
349 struct targ_fn_descriptor
352 const struct targ_fn_launch
*launch
;
354 int max_threads_per_block
;
357 /* A loaded PTX image. */
358 struct ptx_image_data
360 const void *target_data
;
363 struct targ_fn_descriptor
*fns
; /* Array of functions. */
365 struct ptx_image_data
*next
;
373 struct ptx_stream
*null_stream
;
374 /* All non-null streams associated with this device (actually context),
375 either created implicitly or passed in from the user (via
376 acc_set_cuda_stream). */
377 struct ptx_stream
*active_streams
;
379 struct ptx_stream
**arr
;
382 /* A lock for use when manipulating the above stream list and array. */
383 pthread_mutex_t stream_lock
;
395 int max_threads_per_block
;
396 int max_threads_per_multiprocessor
;
397 int default_dims
[GOMP_DIM_MAX
];
399 struct ptx_image_data
*images
; /* Images loaded on device. */
400 pthread_mutex_t image_lock
; /* Lock for above list. */
402 struct ptx_device
*next
;
410 PTX_EVT_ASYNC_CLEANUP
421 struct ptx_event
*next
;
424 static pthread_mutex_t ptx_event_lock
;
425 static struct ptx_event
*ptx_events
;
427 static struct ptx_device
**ptx_devices
;
429 static inline struct nvptx_thread
*
432 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
436 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
439 struct ptx_stream
*null_stream
440 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
442 null_stream
->stream
= NULL
;
443 null_stream
->host_thread
= pthread_self ();
444 null_stream
->multithreaded
= true;
445 null_stream
->d
= (CUdeviceptr
) NULL
;
446 null_stream
->h
= NULL
;
447 if (!map_init (null_stream
))
450 ptx_dev
->null_stream
= null_stream
;
451 ptx_dev
->active_streams
= NULL
;
452 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
457 /* This is just a guess -- make space for as many async streams as the
458 current device is capable of concurrently executing. This can grow
459 later as necessary. No streams are created yet. */
460 ptx_dev
->async_streams
.arr
461 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
462 ptx_dev
->async_streams
.size
= concurrency
;
464 for (i
= 0; i
< concurrency
; i
++)
465 ptx_dev
->async_streams
.arr
[i
] = NULL
;
471 fini_streams_for_device (struct ptx_device
*ptx_dev
)
473 free (ptx_dev
->async_streams
.arr
);
476 while (ptx_dev
->active_streams
!= NULL
)
478 struct ptx_stream
*s
= ptx_dev
->active_streams
;
479 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
483 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
484 if (r
!= CUDA_SUCCESS
)
486 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
492 ret
&= map_fini (ptx_dev
->null_stream
);
493 free (ptx_dev
->null_stream
);
497 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
498 thread THREAD (and also current device/context). If CREATE is true, create
499 the stream if it does not exist (or use EXISTING if it is non-NULL), and
500 associate the stream with the same thread argument. Returns stream to use
503 static struct ptx_stream
*
504 select_stream_for_async (int async
, pthread_t thread
, bool create
,
507 struct nvptx_thread
*nvthd
= nvptx_thread ();
508 /* Local copy of TLS variable. */
509 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
510 struct ptx_stream
*stream
= NULL
;
511 int orig_async
= async
;
513 /* The special value acc_async_noval (-1) maps (for now) to an
514 implicitly-created stream, which is then handled the same as any other
515 numbered async stream. Other options are available, e.g. using the null
516 stream for anonymous async operations, or choosing an idle stream from an
517 active set. But, stick with this for now. */
518 if (async
> acc_async_sync
)
522 pthread_mutex_lock (&ptx_dev
->stream_lock
);
524 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
525 null stream, and in fact better performance may be obtainable if it doesn't
526 (because the null stream enforces overly-strict synchronisation with
527 respect to other streams for legacy reasons, and that's probably not
528 needed with OpenACC). Maybe investigate later. */
529 if (async
== acc_async_sync
)
530 stream
= ptx_dev
->null_stream
;
531 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
532 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
533 stream
= ptx_dev
->async_streams
.arr
[async
];
534 else if (async
>= 0 && create
)
536 if (async
>= ptx_dev
->async_streams
.size
)
538 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
540 if (async
>= newsize
)
543 ptx_dev
->async_streams
.arr
544 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
545 newsize
* sizeof (struct ptx_stream
*));
547 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
548 ptx_dev
->async_streams
.arr
[i
] = NULL
;
550 ptx_dev
->async_streams
.size
= newsize
;
553 /* Create a new stream on-demand if there isn't one already, or if we're
554 setting a particular async value to an existing (externally-provided)
556 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
560 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
563 s
->stream
= existing
;
566 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
568 if (r
!= CUDA_SUCCESS
)
570 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
571 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
576 /* If CREATE is true, we're going to be queueing some work on this
577 stream. Associate it with the current host thread. */
578 s
->host_thread
= thread
;
579 s
->multithreaded
= false;
581 s
->d
= (CUdeviceptr
) NULL
;
585 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
586 GOMP_PLUGIN_fatal ("map_init fail");
589 s
->next
= ptx_dev
->active_streams
;
590 ptx_dev
->active_streams
= s
;
591 ptx_dev
->async_streams
.arr
[async
] = s
;
594 stream
= ptx_dev
->async_streams
.arr
[async
];
599 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
600 GOMP_PLUGIN_fatal ("bad async %d", async
);
605 assert (stream
!= NULL
);
607 /* If we're trying to use the same stream from different threads
608 simultaneously, set stream->multithreaded to true. This affects the
609 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
610 only wait for asynchronous launches from the same host thread they are
611 invoked on. If multiple threads use the same async value, we make note
612 of that here and fall back to testing/waiting for all threads in those
614 if (thread
!= stream
->host_thread
)
615 stream
->multithreaded
= true;
617 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
619 else if (stream
&& !stream
->multithreaded
620 && !pthread_equal (stream
->host_thread
, thread
))
621 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
626 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
627 should be locked on entry and remains locked on exit. */
634 if (instantiated_devices
!= 0)
638 pthread_mutex_init (&ptx_event_lock
, NULL
);
640 if (!init_cuda_lib ())
643 CUDA_CALL (cuInit
, 0);
645 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
646 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
651 /* Select the N'th PTX device for the current host thread. The device must
652 have been previously opened before calling this function. */
655 nvptx_attach_host_thread_to_device (int n
)
659 struct ptx_device
*ptx_dev
;
662 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
663 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
665 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
669 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
675 ptx_dev
= ptx_devices
[n
];
678 GOMP_PLUGIN_error ("device %d not found", n
);
682 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
684 /* We don't necessarily have a current context (e.g. if it has been
685 destroyed).  Pop it if we do though.  */
687 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
689 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
694 static struct ptx_device
*
695 nvptx_open_device (int n
)
697 struct ptx_device
*ptx_dev
;
698 CUdevice dev
, ctx_dev
;
700 int async_engines
, pi
;
702 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
704 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
708 ptx_dev
->ctx_shared
= false;
710 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
711 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
713 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
717 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
719 /* The current host thread has an active context for a different device.
722 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
725 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
728 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
730 ptx_dev
->ctx_shared
= true;
732 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
733 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
734 ptx_dev
->overlap
= pi
;
736 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
737 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
740 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
741 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
742 ptx_dev
->concur
= pi
;
744 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
745 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
748 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
749 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
752 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
753 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
754 ptx_dev
->clock_khz
= pi
;
756 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
757 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
758 ptx_dev
->num_sms
= pi
;
760 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
761 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
762 ptx_dev
->regs_per_block
= pi
;
764 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
765 in CUDA 6.0 and newer. */
766 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
767 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
769 /* Fallback: use limit of registers per block, which is usually equal. */
770 if (r
== CUDA_ERROR_INVALID_VALUE
)
771 pi
= ptx_dev
->regs_per_block
;
772 else if (r
!= CUDA_SUCCESS
)
774 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
777 ptx_dev
->regs_per_sm
= pi
;
779 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
780 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
783 GOMP_PLUGIN_error ("Only warp size 32 is supported");
786 ptx_dev
->warp_size
= pi
;
788 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
789 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
790 ptx_dev
->max_threads_per_block
= pi
;
792 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
793 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
794 ptx_dev
->max_threads_per_multiprocessor
= pi
;
796 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
797 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
798 if (r
!= CUDA_SUCCESS
)
801 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
802 ptx_dev
->default_dims
[i
] = 0;
804 ptx_dev
->images
= NULL
;
805 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
807 if (!init_streams_for_device (ptx_dev
, async_engines
))
814 nvptx_close_device (struct ptx_device
*ptx_dev
)
819 if (!fini_streams_for_device (ptx_dev
))
822 pthread_mutex_destroy (&ptx_dev
->image_lock
);
824 if (!ptx_dev
->ctx_shared
)
825 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
832 nvptx_get_num_devices (void)
836 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
838 if (sizeof (void *) != 8)
840 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
841 " only 64-bit configurations are supported\n");
845 /* This function will be called before the plugin has been initialized in
846 order to enumerate available devices, but CUDA API routines can't be used
847 until cuInit has been called. Just call it now (but don't yet do any
848 further initialization). */
849 if (instantiated_devices
== 0)
851 if (!init_cuda_lib ())
853 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
854 /* This is not an error: e.g. we may have CUDA libraries installed but
855 no devices available. */
856 if (r
!= CUDA_SUCCESS
)
858 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
864 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
869 notify_var (const char *var_name
, const char *env_var
)
872 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
874 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
878 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
880 const char *var_name
= "GOMP_NVPTX_JIT";
881 const char *env_var
= secure_getenv (var_name
);
882 notify_var (var_name
, env_var
);
887 const char *c
= env_var
;
893 if (c
[0] == '-' && c
[1] == 'O'
894 && '0' <= c
[2] && c
[2] <= '4'
895 && (c
[3] == '\0' || c
[3] == ' '))
897 *gomp_nvptx_o
= c
[2] - '0';
902 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
908 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
911 CUjit_option opts
[7];
916 CUlinkState linkstate
;
919 size_t linkoutsize
__attribute__ ((unused
));
921 opts
[0] = CU_JIT_WALL_TIME
;
922 optvals
[0] = &elapsed
;
924 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
925 optvals
[1] = &ilog
[0];
927 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
928 optvals
[2] = (void *) sizeof ilog
;
930 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
931 optvals
[3] = &elog
[0];
933 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
934 optvals
[4] = (void *) sizeof elog
;
936 opts
[5] = CU_JIT_LOG_VERBOSE
;
937 optvals
[5] = (void *) 1;
939 static intptr_t gomp_nvptx_o
= -1;
941 static bool init_done
= false;
944 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
949 if (gomp_nvptx_o
!= -1)
951 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
952 optvals
[nopts
] = (void *) gomp_nvptx_o
;
956 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
957 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
959 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
961 for (; num_objs
--; ptx_objs
++)
963 /* cuLinkAddData's 'data' argument erroneously omits the const
965 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
966 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
967 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
968 (char *) ptx_objs
->code
, ptx_objs
->size
,
971 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
972 (char *) ptx_objs
->code
, ptx_objs
->size
,
974 if (r
!= CUDA_SUCCESS
)
976 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
977 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
983 GOMP_PLUGIN_debug (0, "Linking\n");
984 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
986 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
987 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
989 if (r
!= CUDA_SUCCESS
)
991 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
995 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
996 CUDA_CALL (cuLinkDestroy
, linkstate
);
1001 event_gc (bool memmap_lockable
)
1003 struct ptx_event
*ptx_event
= ptx_events
;
1004 struct ptx_event
*async_cleanups
= NULL
;
1005 struct nvptx_thread
*nvthd
= nvptx_thread ();
1007 pthread_mutex_lock (&ptx_event_lock
);
1009 while (ptx_event
!= NULL
)
1012 struct ptx_event
*e
= ptx_event
;
1014 ptx_event
= ptx_event
->next
;
1016 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1019 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1020 if (r
== CUDA_SUCCESS
)
1022 bool append_async
= false;
1037 case PTX_EVT_ASYNC_CLEANUP
:
1039 /* The function gomp_plugin_async_unmap_vars needs to claim the
1040 memory-map splay tree lock for the current device, so we
1041 can't call it when one of our callers has already claimed
1042 the lock. In that case, just delay the GC for this event
1044 if (!memmap_lockable
)
1047 append_async
= true;
1052 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1055 /* Unlink 'e' from ptx_events list. */
1056 if (ptx_events
== e
)
1057 ptx_events
= ptx_events
->next
;
1060 struct ptx_event
*e_
= ptx_events
;
1061 while (e_
->next
!= e
)
1063 e_
->next
= e_
->next
->next
;
1068 e
->next
= async_cleanups
;
1076 pthread_mutex_unlock (&ptx_event_lock
);
1078 /* We have to do these here, after ptx_event_lock is released. */
1079 while (async_cleanups
)
1081 struct ptx_event
*e
= async_cleanups
;
1082 async_cleanups
= async_cleanups
->next
;
1084 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
1090 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1092 struct ptx_event
*ptx_event
;
1093 struct nvptx_thread
*nvthd
= nvptx_thread ();
1095 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1096 || type
== PTX_EVT_ASYNC_CLEANUP
);
1098 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1099 ptx_event
->type
= type
;
1101 ptx_event
->addr
= h
;
1102 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1103 ptx_event
->val
= val
;
1105 pthread_mutex_lock (&ptx_event_lock
);
1107 ptx_event
->next
= ptx_events
;
1108 ptx_events
= ptx_event
;
1110 pthread_mutex_unlock (&ptx_event_lock
);
1114 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1115 int async
, unsigned *dims
, void *targ_mem_desc
)
1117 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1118 CUfunction function
;
1121 struct ptx_stream
*dev_str
;
1124 struct nvptx_thread
*nvthd
= nvptx_thread ();
1125 int warp_size
= nvthd
->ptx_dev
->warp_size
;
1126 const char *maybe_abort_msg
= "(perhaps abort was called)";
1128 function
= targ_fn
->fn
;
1130 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1131 assert (dev_str
== nvthd
->current_stream
);
1133 /* Initialize the launch dimensions. Typically this is constant,
1134 provided by the device compiler, but we must permit runtime
1137 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1139 if (targ_fn
->launch
->dim
[i
])
1140 dims
[i
] = targ_fn
->launch
->dim
[i
];
1147 pthread_mutex_lock (&ptx_dev_lock
);
1149 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
1150 if (!gomp_openacc_dims
[0])
1152 /* See if the user provided GOMP_OPENACC_DIM environment
1153 variable to specify runtime defaults. */
1154 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1155 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1158 if (!nvthd
->ptx_dev
->default_dims
[0])
1160 int default_dims
[GOMP_DIM_MAX
];
1161 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1162 default_dims
[i
] = gomp_openacc_dims
[i
];
1164 int gang
, worker
, vector
;
1166 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
1167 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
1168 int dev_size
= nvthd
->ptx_dev
->num_sms
;
1169 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1170 " dev_size=%d, cpu_size=%d\n",
1171 warp_size
, block_size
, dev_size
, cpu_size
);
1173 gang
= (cpu_size
/ block_size
) * dev_size
;
1174 worker
= block_size
/ warp_size
;
1178 /* There is no upper bound on the gang size. The best size
1179 matches the hardware configuration. Logical gangs are
1180 scheduled onto physical hardware. To maximize usage, we
1181 should guess a large number. */
1182 if (default_dims
[GOMP_DIM_GANG
] < 1)
1183 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1184 /* The worker size must not exceed the hardware. */
1185 if (default_dims
[GOMP_DIM_WORKER
] < 1
1186 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1187 default_dims
[GOMP_DIM_WORKER
] = worker
;
1188 /* The vector size must exactly match the hardware. */
1189 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1190 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1191 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1193 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1194 default_dims
[GOMP_DIM_GANG
],
1195 default_dims
[GOMP_DIM_WORKER
],
1196 default_dims
[GOMP_DIM_VECTOR
]);
1198 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1199 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
1201 pthread_mutex_unlock (&ptx_dev_lock
);
1204 bool default_dim_p
[GOMP_DIM_MAX
];
1205 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1206 default_dim_p
[i
] = !dims
[i
];
1208 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
1210 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1211 if (default_dim_p
[i
])
1212 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
1214 if (default_dim_p
[GOMP_DIM_VECTOR
])
1215 dims
[GOMP_DIM_VECTOR
]
1216 = MIN (dims
[GOMP_DIM_VECTOR
],
1217 (targ_fn
->max_threads_per_block
/ warp_size
1220 if (default_dim_p
[GOMP_DIM_WORKER
])
1221 dims
[GOMP_DIM_WORKER
]
1222 = MIN (dims
[GOMP_DIM_WORKER
],
1223 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
1227 /* Handle the case that the compiler allows the runtime to choose
1228 the vector-length conservatively, by ignoring
1229 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
1232 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
1233 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
1234 exceed targ_fn->max_threads_per_block. */
1235 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
1236 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
1239 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
1240 &blocks
, function
, NULL
, 0,
1241 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
1242 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
1243 "grid = %d, block = %d\n", grids
, blocks
);
1245 /* Keep the num_gangs proportional to the block size. In
1246 the case where a block size is limited by shared-memory
1247 or the register file capacity, the runtime will not
1248 excessively over assign gangs to the multiprocessor
1249 units if their state is going to be swapped out even
1250 more than necessary. The constant factor 2 is there to
1251 prevent threads from idling when there is insufficient
1254 gangs
= 2 * grids
* (blocks
/ warp_size
);
1257 vectors
= warp_size
;
1261 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
1263 : dims
[GOMP_DIM_VECTOR
]);
1264 workers
= blocks
/ actual_vectors
;
1267 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1268 if (default_dim_p
[i
])
1271 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
1272 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
1273 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
1274 default: GOMP_PLUGIN_fatal ("invalid dim");
1280 /* Check if the accelerator has sufficient hardware resources to
1281 launch the offloaded kernel. */
1282 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
1283 > targ_fn
->max_threads_per_block
)
1286 = targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
];
1287 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1288 " launch '%s' with num_workers = %d; recompile the"
1289 " program with 'num_workers = %d' on that offloaded"
1290 " region or '-fopenacc-dim=:%d'",
1291 targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1292 suggest_workers
, suggest_workers
);
1295 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1296 the host and the device. HP is a host pointer to the new chunk, and DP is
1297 the corresponding device pointer. */
1298 map_push (dev_str
, mapnum
* sizeof (void *), &hp
, &dp
);
1300 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1302 /* Copy the array of arguments to the mapped page. */
1303 for (i
= 0; i
< mapnum
; i
++)
1304 ((void **) hp
)[i
] = devaddrs
[i
];
1306 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1307 fact have the same value on a unified-memory system). */
1308 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1309 mapnum
* sizeof (void *));
1310 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1311 " gangs=%u, workers=%u, vectors=%u\n",
1312 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1313 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1317 // num_gangs nctaid.x
1318 // num_workers ntid.y
1319 // vector length ntid.x
1322 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1323 dims
[GOMP_DIM_GANG
], 1, 1,
1324 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1325 0, dev_str
->stream
, kargs
, 0);
1327 #ifndef DISABLE_ASYNC
1328 if (async
< acc_async_noval
)
1330 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1331 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1332 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1334 else if (r
!= CUDA_SUCCESS
)
1335 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1341 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1343 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1344 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1345 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1347 else if (r
!= CUDA_SUCCESS
)
1348 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1352 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1354 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1357 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1358 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1359 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1361 else if (r
!= CUDA_SUCCESS
)
1362 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1365 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1366 targ_fn
->launch
->fn
);
1368 #ifndef DISABLE_ASYNC
1369 if (async
< acc_async_noval
)
1374 void * openacc_get_current_cuda_context (void);
1377 nvptx_alloc (size_t s
)
1381 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1386 nvptx_free (void *p
)
1391 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1392 if ((CUdeviceptr
) p
!= pb
)
1394 GOMP_PLUGIN_error ("invalid device address");
1398 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1404 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1408 struct nvptx_thread
*nvthd
= nvptx_thread ();
1414 GOMP_PLUGIN_error ("invalid device address");
1418 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1422 GOMP_PLUGIN_error ("invalid device address");
1427 GOMP_PLUGIN_error ("invalid host address");
1432 GOMP_PLUGIN_error ("invalid host or device address");
1435 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1437 GOMP_PLUGIN_error ("invalid size");
1441 #ifndef DISABLE_ASYNC
1442 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1444 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1445 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1447 CUDA_CALL (cuMemcpyHtoDAsync
,
1448 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1449 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1450 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1454 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1460 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1464 struct nvptx_thread
*nvthd
= nvptx_thread ();
1470 GOMP_PLUGIN_error ("invalid device address");
1474 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1478 GOMP_PLUGIN_error ("invalid device address");
1483 GOMP_PLUGIN_error ("invalid host address");
1488 GOMP_PLUGIN_error ("invalid host or device address");
1491 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1493 GOMP_PLUGIN_error ("invalid size");
1497 #ifndef DISABLE_ASYNC
1498 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1500 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1501 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1503 CUDA_CALL (cuMemcpyDtoHAsync
,
1504 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1505 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1506 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1510 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1516 nvptx_set_async (int async
)
1518 struct nvptx_thread
*nvthd
= nvptx_thread ();
1519 nvthd
->current_stream
1520 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1524 nvptx_async_test (int async
)
1527 struct ptx_stream
*s
;
1529 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1532 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1534 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1535 if (r
== CUDA_SUCCESS
)
1537 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1538 whether all work has completed on this stream, and if so omits the call
1539 to the wait hook. If that happens, event_gc might not get called
1540 (which prevents variables from getting unmapped and their associated
1541 device storage freed), so call it here. */
1545 else if (r
== CUDA_ERROR_NOT_READY
)
1548 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1554 nvptx_async_test_all (void)
1556 struct ptx_stream
*s
;
1557 pthread_t self
= pthread_self ();
1558 struct nvptx_thread
*nvthd
= nvptx_thread ();
1560 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1562 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1564 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1565 && CUDA_CALL_NOCHECK (cuStreamQuery
,
1566 s
->stream
) == CUDA_ERROR_NOT_READY
)
1568 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1573 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1581 nvptx_wait (int async
)
1583 struct ptx_stream
*s
;
1585 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1587 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1589 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1595 nvptx_wait_async (int async1
, int async2
)
1598 struct ptx_stream
*s1
, *s2
;
1599 pthread_t self
= pthread_self ();
1601 /* The stream that is waiting (rather than being waited for) doesn't
1602 necessarily have to exist already. */
1603 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1605 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1607 GOMP_PLUGIN_fatal ("invalid async 1\n");
1610 GOMP_PLUGIN_fatal ("identical parameters");
1612 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1614 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1618 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1620 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1622 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
1626 nvptx_wait_all (void)
1629 struct ptx_stream
*s
;
1630 pthread_t self
= pthread_self ();
1631 struct nvptx_thread
*nvthd
= nvptx_thread ();
1633 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1635 /* Wait for active streams initiated by this thread (or by multiple threads)
1637 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1639 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1641 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1642 if (r
== CUDA_SUCCESS
)
1644 else if (r
!= CUDA_ERROR_NOT_READY
)
1645 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1647 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1651 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1657 nvptx_wait_all_async (int async
)
1659 struct ptx_stream
*waiting_stream
, *other_stream
;
1661 struct nvptx_thread
*nvthd
= nvptx_thread ();
1662 pthread_t self
= pthread_self ();
1664 /* The stream doing the waiting. This could be the first mention of the
1665 stream, so create it if necessary. */
1667 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1669 /* Launches on the null stream already block on other streams in the
1671 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1676 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1678 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1679 other_stream
!= NULL
;
1680 other_stream
= other_stream
->next
)
1682 if (!other_stream
->multithreaded
1683 && !pthread_equal (other_stream
->host_thread
, self
))
1686 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1688 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1690 /* Record an event on the waited-for stream. */
1691 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1693 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1695 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1698 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1702 nvptx_get_current_cuda_device (void)
1704 struct nvptx_thread
*nvthd
= nvptx_thread ();
1706 if (!nvthd
|| !nvthd
->ptx_dev
)
1709 return &nvthd
->ptx_dev
->dev
;
1713 nvptx_get_current_cuda_context (void)
1715 struct nvptx_thread
*nvthd
= nvptx_thread ();
1717 if (!nvthd
|| !nvthd
->ptx_dev
)
1720 return nvthd
->ptx_dev
->ctx
;
1724 nvptx_get_cuda_stream (int async
)
1726 struct ptx_stream
*s
;
1727 struct nvptx_thread
*nvthd
= nvptx_thread ();
1729 if (!nvthd
|| !nvthd
->ptx_dev
)
1732 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1734 return s
? s
->stream
: NULL
;
1738 nvptx_set_cuda_stream (int async
, void *stream
)
1740 struct ptx_stream
*oldstream
;
1741 pthread_t self
= pthread_self ();
1742 struct nvptx_thread
*nvthd
= nvptx_thread ();
1745 GOMP_PLUGIN_fatal ("bad async %d", async
);
1747 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1749 /* We have a list of active streams and an array mapping async values to
1750 entries of that list. We need to take "ownership" of the passed-in stream,
1751 and add it to our list, removing the previous entry also (if there was one)
1752 in order to prevent resource leaks. Note the potential for surprise
1753 here: maybe we should keep track of passed-in streams and leave it up to
1754 the user to tidy those up, but that doesn't work for stream handles
1755 returned from acc_get_cuda_stream above... */
1757 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1761 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1762 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1765 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1766 while (s
->next
!= oldstream
)
1768 s
->next
= s
->next
->next
;
1771 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1773 if (!map_fini (oldstream
))
1774 GOMP_PLUGIN_fatal ("error when freeing host memory");
1779 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1781 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1786 /* Plugin entry points. */
1789 GOMP_OFFLOAD_get_name (void)
1795 GOMP_OFFLOAD_get_caps (void)
1797 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1801 GOMP_OFFLOAD_get_type (void)
1803 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1807 GOMP_OFFLOAD_get_num_devices (void)
1809 return nvptx_get_num_devices ();
1813 GOMP_OFFLOAD_init_device (int n
)
1815 struct ptx_device
*dev
;
1817 pthread_mutex_lock (&ptx_dev_lock
);
1819 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1821 pthread_mutex_unlock (&ptx_dev_lock
);
1825 dev
= nvptx_open_device (n
);
1828 ptx_devices
[n
] = dev
;
1829 instantiated_devices
++;
1832 pthread_mutex_unlock (&ptx_dev_lock
);
1838 GOMP_OFFLOAD_fini_device (int n
)
1840 pthread_mutex_lock (&ptx_dev_lock
);
1842 if (ptx_devices
[n
] != NULL
)
1844 if (!nvptx_attach_host_thread_to_device (n
)
1845 || !nvptx_close_device (ptx_devices
[n
]))
1847 pthread_mutex_unlock (&ptx_dev_lock
);
1850 ptx_devices
[n
] = NULL
;
1851 instantiated_devices
--;
1854 pthread_mutex_unlock (&ptx_dev_lock
);
1858 /* Return the libgomp version number we're compatible with. There is
1859 no requirement for cross-version compatibility. */
1862 GOMP_OFFLOAD_version (void)
1864 return GOMP_VERSION
;
1867 /* Initialize __nvptx_clocktick, if present in MODULE. */
1870 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1873 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1874 module
, "__nvptx_clocktick");
1875 if (r
== CUDA_ERROR_NOT_FOUND
)
1877 if (r
!= CUDA_SUCCESS
)
1878 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1879 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1880 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1881 sizeof (__nvptx_clocktick
));
1882 if (r
!= CUDA_SUCCESS
)
1883 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1886 /* Load the (partial) program described by TARGET_DATA to device
1887 number ORD. Allocate and return TARGET_TABLE. */
1890 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1891 struct addr_pair
**target_table
)
1894 const char *const *var_names
;
1895 const struct targ_fn_launch
*fn_descs
;
1896 unsigned int fn_entries
, var_entries
, i
, j
;
1897 struct targ_fn_descriptor
*targ_fns
;
1898 struct addr_pair
*targ_tbl
;
1899 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1900 struct ptx_image_data
*new_image
;
1901 struct ptx_device
*dev
;
1903 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1905 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1906 " (expected %u, received %u)",
1907 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1911 if (!nvptx_attach_host_thread_to_device (ord
)
1912 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1915 dev
= ptx_devices
[ord
];
1917 /* The mkoffload utility emits a struct of pointers/integers at the
1918 start of each offload image. The array of kernel names and the
1919 functions addresses form a one-to-one correspondence. */
1921 var_entries
= img_header
->var_num
;
1922 var_names
= img_header
->var_names
;
1923 fn_entries
= img_header
->fn_num
;
1924 fn_descs
= img_header
->fn_descs
;
1926 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1927 * (fn_entries
+ var_entries
));
1928 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1931 *target_table
= targ_tbl
;
1933 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1934 new_image
->target_data
= target_data
;
1935 new_image
->module
= module
;
1936 new_image
->fns
= targ_fns
;
1938 pthread_mutex_lock (&dev
->image_lock
);
1939 new_image
->next
= dev
->images
;
1940 dev
->images
= new_image
;
1941 pthread_mutex_unlock (&dev
->image_lock
);
1943 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1945 CUfunction function
;
1948 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1950 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1951 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1952 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1953 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1955 targ_fns
->fn
= function
;
1956 targ_fns
->launch
= &fn_descs
[i
];
1957 targ_fns
->regs_per_thread
= nregs
;
1958 targ_fns
->max_threads_per_block
= mthrs
;
1960 targ_tbl
->start
= (uintptr_t) targ_fns
;
1961 targ_tbl
->end
= targ_tbl
->start
+ 1;
1964 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1969 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1970 &var
, &bytes
, module
, var_names
[j
]);
1972 targ_tbl
->start
= (uintptr_t) var
;
1973 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1976 nvptx_set_clocktick (module
, dev
);
1978 return fn_entries
+ var_entries
;
1981 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1982 function descriptors allocated by G_O_load_image. */
1985 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1987 struct ptx_image_data
*image
, **prev_p
;
1988 struct ptx_device
*dev
= ptx_devices
[ord
];
1990 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1992 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1993 " (expected %u, received %u)",
1994 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1999 pthread_mutex_lock (&dev
->image_lock
);
2000 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
2001 if (image
->target_data
== target_data
)
2003 *prev_p
= image
->next
;
2004 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
2010 pthread_mutex_unlock (&dev
->image_lock
);
2015 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
2017 if (!nvptx_attach_host_thread_to_device (ord
))
2019 return nvptx_alloc (size
);
2023 GOMP_OFFLOAD_free (int ord
, void *ptr
)
2025 return (nvptx_attach_host_thread_to_device (ord
)
2026 && nvptx_free (ptr
));
2030 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
2032 return (nvptx_attach_host_thread_to_device (ord
)
2033 && nvptx_dev2host (dst
, src
, n
));
2037 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
2039 return (nvptx_attach_host_thread_to_device (ord
)
2040 && nvptx_host2dev (dst
, src
, n
));
2044 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
2046 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2047 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
2048 ptx_dev
->null_stream
->stream
);
2052 void (*device_run
) (int n
, void *fn_ptr
, void *vars
) = NULL
;
2055 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
2056 void **hostaddrs
, void **devaddrs
,
2057 int async
, unsigned *dims
, void *targ_mem_desc
)
2059 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, async
, dims
, targ_mem_desc
);
2063 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
2065 struct nvptx_thread
*nvthd
= nvptx_thread ();
2066 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
2068 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
2069 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
2070 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Thin OpenACC async-queue hooks, each forwarding to the corresponding
   nvptx_* helper above.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
2116 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
2118 struct ptx_device
*ptx_dev
;
2119 struct nvptx_thread
*nvthd
2120 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
2123 ptx_dev
= ptx_devices
[ord
];
2127 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
2129 assert (ptx_dev
->ctx
);
2132 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
2134 nvthd
->current_stream
= ptx_dev
->null_stream
;
2135 nvthd
->ptx_dev
= ptx_dev
;
2137 return (void *) nvthd
;
/* Release per-thread state allocated by the create hook above.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}
2174 /* Adjust launch dimensions: pick good values for number of blocks and warps
2175 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2179 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2180 struct ptx_device
*ptx_dev
,
2181 int *teams_p
, int *threads_p
)
2183 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2184 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2185 and libgcc, which matches documented limit of all GPUs as of 2015. */
2186 if (max_warps_block
> 32)
2187 max_warps_block
= 32;
2188 if (*threads_p
<= 0)
2190 if (*threads_p
> max_warps_block
)
2191 *threads_p
= max_warps_block
;
2193 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2194 /* This is an estimate of how many blocks the device can host simultaneously.
2195 Actual limit, which may be lower, can be queried with "occupancy control"
2196 driver interface (since CUDA 6.0). */
2197 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2198 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2199 *teams_p
= max_blocks
;
2202 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2206 nvptx_stacks_size ()
2211 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2214 nvptx_stacks_alloc (size_t size
, int num
)
2217 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
2218 if (r
!= CUDA_SUCCESS
)
2219 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2220 return (void *) stacks
;
2223 /* Release storage previously allocated by nvptx_stacks_alloc. */
2226 nvptx_stacks_free (void *p
, int num
)
2228 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
2229 if (r
!= CUDA_SUCCESS
)
2230 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
2234 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2236 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
2238 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2239 const char *maybe_abort_msg
= "(perhaps abort was called)";
2240 int teams
= 0, threads
= 0;
2243 GOMP_PLUGIN_fatal ("No target arguments provided");
2246 intptr_t id
= (intptr_t) *args
++, val
;
2247 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2248 val
= (intptr_t) *args
++;
2250 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2251 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2253 val
= val
> INT_MAX
? INT_MAX
: val
;
2254 id
&= GOMP_TARGET_ARG_ID_MASK
;
2255 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2257 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2260 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2262 size_t stack_size
= nvptx_stacks_size ();
2263 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2264 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2265 size_t fn_args_size
= sizeof fn_args
;
2267 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2268 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2271 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2272 32, threads
, 1, 0, ptx_dev
->null_stream
->stream
,
2274 if (r
!= CUDA_SUCCESS
)
2275 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2277 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2278 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2279 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2281 else if (r
!= CUDA_SUCCESS
)
2282 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2283 nvptx_stacks_free (stacks
, teams
* threads
);
2287 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
2290 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");