1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
65 const char *, unsigned, CUjit_option
*, void **);
66 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
69 #define DO_PRAGMA(x) _Pragma (#x)
71 #if PLUGIN_NVPTX_DYNAMIC
76 # define CUDA_ONE_CALL(call) \
77 __typeof (call) *call;
78 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
80 #include "cuda-lib.def"
82 # undef CUDA_ONE_CALL_MAYBE_NULL
86 /* -1 if init_cuda_lib has not been called yet, false
87 if it has been and failed, true if it has been and succeeded. */
88 static signed char cuda_lib_inited
= -1;
90 /* Dynamically load the CUDA runtime library and initialize function
91 pointers, return false if unsuccessful, true if successful. */
95 if (cuda_lib_inited
!= -1)
96 return cuda_lib_inited
;
97 const char *cuda_runtime_lib
= "libcuda.so.1";
98 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
99 cuda_lib_inited
= false;
103 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
104 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
105 # define CUDA_ONE_CALL_1(call, allow_null) \
106 cuda_lib.call = dlsym (h, #call); \
107 if (!allow_null && cuda_lib.call == NULL) \
109 #include "cuda-lib.def"
110 # undef CUDA_ONE_CALL
111 # undef CUDA_ONE_CALL_1
112 # undef CUDA_ONE_CALL_MAYBE_NULL
114 cuda_lib_inited
= true;
117 # define CUDA_CALL_PREFIX cuda_lib.
120 # define CUDA_ONE_CALL(call)
121 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
122 #include "cuda-lib.def"
123 #undef CUDA_ONE_CALL_MAYBE_NULL
126 # define CUDA_CALL_PREFIX
127 # define init_cuda_lib() true
130 #include "secure_getenv.h"
134 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
135 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
137 /* Convenience macros for the frequently used CUDA library call and
138 error handling sequence as well as CUDA library calls that
139 do the error checking themselves or don't do it at all. */
141 #define CUDA_CALL_ERET(ERET, FN, ...) \
144 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
145 if (__r != CUDA_SUCCESS) \
147 GOMP_PLUGIN_error (#FN " error: %s", \
153 #define CUDA_CALL(FN, ...) \
154 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
156 #define CUDA_CALL_ASSERT(FN, ...) \
159 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
160 if (__r != CUDA_SUCCESS) \
162 GOMP_PLUGIN_fatal (#FN " error: %s", \
167 #define CUDA_CALL_NOCHECK(FN, ...) \
168 CUDA_CALL_PREFIX FN (__VA_ARGS__)
170 #define CUDA_CALL_EXISTS(FN) \
174 cuda_error (CUresult r
)
176 const char *fallback
= "unknown cuda error";
179 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
182 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
183 if (r
== CUDA_SUCCESS
)
189 static unsigned int instantiated_devices
= 0;
190 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
195 pthread_t host_thread
;
206 struct ptx_stream
*next
;
209 /* Thread-specific data for PTX. */
213 struct ptx_stream
*current_stream
;
214 struct ptx_device
*ptx_dev
;
218 map_init (struct ptx_stream
*s
)
220 int size
= getpagesize ();
226 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
227 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
232 s
->h_end
= s
->h_begin
+ size
;
233 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
241 map_fini (struct ptx_stream
*s
)
243 CUDA_CALL (cuMemFreeHost
, s
->h
);
248 map_pop (struct ptx_stream
*s
)
255 s
->h_tail
= s
->h_next
;
257 if (s
->h_tail
>= s
->h_end
)
258 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
260 if (s
->h_next
== s
->h_tail
)
261 s
->h_prev
= s
->h_next
;
263 assert (s
->h_next
>= s
->h_begin
);
264 assert (s
->h_tail
>= s
->h_begin
);
265 assert (s
->h_prev
>= s
->h_begin
);
267 assert (s
->h_next
<= s
->h_end
);
268 assert (s
->h_tail
<= s
->h_end
);
269 assert (s
->h_prev
<= s
->h_end
);
273 map_push (struct ptx_stream
*s
, size_t size
, void **h
, void **d
)
280 left
= s
->h_end
- s
->h_next
;
287 assert (s
->h_next
== s
->h_prev
);
288 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
293 offset
= s
->h_next
- s
->h
;
295 *d
= (void *)(s
->d
+ offset
);
296 *h
= (void *)(s
->h
+ offset
);
298 s
->h_prev
= s
->h_next
;
304 assert (s
->h_next
>= s
->h_begin
);
305 assert (s
->h_tail
>= s
->h_begin
);
306 assert (s
->h_prev
>= s
->h_begin
);
307 assert (s
->h_next
<= s
->h_end
);
308 assert (s
->h_tail
<= s
->h_end
);
309 assert (s
->h_prev
<= s
->h_end
);
314 /* Target data function launch information. */
316 struct targ_fn_launch
319 unsigned short dim
[GOMP_DIM_MAX
];
322 /* Target PTX object information. */
330 /* Target data image information. */
332 typedef struct nvptx_tdata
334 const struct targ_ptx_obj
*ptx_objs
;
337 const char *const *var_names
;
340 const struct targ_fn_launch
*fn_descs
;
344 /* Descriptor of a loaded function. */
346 struct targ_fn_descriptor
349 const struct targ_fn_launch
*launch
;
351 int max_threads_per_block
;
354 /* A loaded PTX image. */
355 struct ptx_image_data
357 const void *target_data
;
360 struct targ_fn_descriptor
*fns
; /* Array of functions. */
362 struct ptx_image_data
*next
;
370 struct ptx_stream
*null_stream
;
371 /* All non-null streams associated with this device (actually context),
372 either created implicitly or passed in from the user (via
373 acc_set_cuda_stream). */
374 struct ptx_stream
*active_streams
;
376 struct ptx_stream
**arr
;
379 /* A lock for use when manipulating the above stream list and array. */
380 pthread_mutex_t stream_lock
;
392 int max_threads_per_block
;
393 int max_threads_per_multiprocessor
;
394 int default_dims
[GOMP_DIM_MAX
];
396 struct ptx_image_data
*images
; /* Images loaded on device. */
397 pthread_mutex_t image_lock
; /* Lock for above list. */
399 struct ptx_device
*next
;
407 PTX_EVT_ASYNC_CLEANUP
418 struct ptx_event
*next
;
421 static pthread_mutex_t ptx_event_lock
;
422 static struct ptx_event
*ptx_events
;
424 static struct ptx_device
**ptx_devices
;
426 static inline struct nvptx_thread
*
429 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
433 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
436 struct ptx_stream
*null_stream
437 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
439 null_stream
->stream
= NULL
;
440 null_stream
->host_thread
= pthread_self ();
441 null_stream
->multithreaded
= true;
442 null_stream
->d
= (CUdeviceptr
) NULL
;
443 null_stream
->h
= NULL
;
444 if (!map_init (null_stream
))
447 ptx_dev
->null_stream
= null_stream
;
448 ptx_dev
->active_streams
= NULL
;
449 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
454 /* This is just a guess -- make space for as many async streams as the
455 current device is capable of concurrently executing. This can grow
456 later as necessary. No streams are created yet. */
457 ptx_dev
->async_streams
.arr
458 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
459 ptx_dev
->async_streams
.size
= concurrency
;
461 for (i
= 0; i
< concurrency
; i
++)
462 ptx_dev
->async_streams
.arr
[i
] = NULL
;
468 fini_streams_for_device (struct ptx_device
*ptx_dev
)
470 free (ptx_dev
->async_streams
.arr
);
473 while (ptx_dev
->active_streams
!= NULL
)
475 struct ptx_stream
*s
= ptx_dev
->active_streams
;
476 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
480 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
481 if (r
!= CUDA_SUCCESS
)
483 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
489 ret
&= map_fini (ptx_dev
->null_stream
);
490 free (ptx_dev
->null_stream
);
494 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
495 thread THREAD (and also current device/context). If CREATE is true, create
496 the stream if it does not exist (or use EXISTING if it is non-NULL), and
497 associate the stream with the same thread argument. Returns stream to use
500 static struct ptx_stream
*
501 select_stream_for_async (int async
, pthread_t thread
, bool create
,
504 struct nvptx_thread
*nvthd
= nvptx_thread ();
505 /* Local copy of TLS variable. */
506 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
507 struct ptx_stream
*stream
= NULL
;
508 int orig_async
= async
;
510 /* The special value acc_async_noval (-1) maps (for now) to an
511 implicitly-created stream, which is then handled the same as any other
512 numbered async stream. Other options are available, e.g. using the null
513 stream for anonymous async operations, or choosing an idle stream from an
514 active set. But, stick with this for now. */
515 if (async
> acc_async_sync
)
519 pthread_mutex_lock (&ptx_dev
->stream_lock
);
521 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
522 null stream, and in fact better performance may be obtainable if it doesn't
523 (because the null stream enforces overly-strict synchronisation with
524 respect to other streams for legacy reasons, and that's probably not
525 needed with OpenACC). Maybe investigate later. */
526 if (async
== acc_async_sync
)
527 stream
= ptx_dev
->null_stream
;
528 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
529 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
530 stream
= ptx_dev
->async_streams
.arr
[async
];
531 else if (async
>= 0 && create
)
533 if (async
>= ptx_dev
->async_streams
.size
)
535 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
537 if (async
>= newsize
)
540 ptx_dev
->async_streams
.arr
541 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
542 newsize
* sizeof (struct ptx_stream
*));
544 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
545 ptx_dev
->async_streams
.arr
[i
] = NULL
;
547 ptx_dev
->async_streams
.size
= newsize
;
550 /* Create a new stream on-demand if there isn't one already, or if we're
551 setting a particular async value to an existing (externally-provided)
553 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
557 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
560 s
->stream
= existing
;
563 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
565 if (r
!= CUDA_SUCCESS
)
567 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
568 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
573 /* If CREATE is true, we're going to be queueing some work on this
574 stream. Associate it with the current host thread. */
575 s
->host_thread
= thread
;
576 s
->multithreaded
= false;
578 s
->d
= (CUdeviceptr
) NULL
;
582 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
583 GOMP_PLUGIN_fatal ("map_init fail");
586 s
->next
= ptx_dev
->active_streams
;
587 ptx_dev
->active_streams
= s
;
588 ptx_dev
->async_streams
.arr
[async
] = s
;
591 stream
= ptx_dev
->async_streams
.arr
[async
];
596 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
597 GOMP_PLUGIN_fatal ("bad async %d", async
);
602 assert (stream
!= NULL
);
604 /* If we're trying to use the same stream from different threads
605 simultaneously, set stream->multithreaded to true. This affects the
606 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
607 only wait for asynchronous launches from the same host thread they are
608 invoked on. If multiple threads use the same async value, we make note
609 of that here and fall back to testing/waiting for all threads in those
611 if (thread
!= stream
->host_thread
)
612 stream
->multithreaded
= true;
614 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
616 else if (stream
&& !stream
->multithreaded
617 && !pthread_equal (stream
->host_thread
, thread
))
618 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
623 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
624 should be locked on entry and remains locked on exit. */
631 if (instantiated_devices
!= 0)
635 pthread_mutex_init (&ptx_event_lock
, NULL
);
637 if (!init_cuda_lib ())
640 CUDA_CALL (cuInit
, 0);
642 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
643 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
648 /* Select the N'th PTX device for the current host thread. The device must
649 have been previously opened before calling this function. */
652 nvptx_attach_host_thread_to_device (int n
)
656 struct ptx_device
*ptx_dev
;
659 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
660 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
662 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
666 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
672 ptx_dev
= ptx_devices
[n
];
675 GOMP_PLUGIN_error ("device %d not found", n
);
679 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
681 /* We don't necessarily have a current context (e.g. if it has been
682 destroyed. Pop it if we do though. */
684 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
686 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
691 static struct ptx_device
*
692 nvptx_open_device (int n
)
694 struct ptx_device
*ptx_dev
;
695 CUdevice dev
, ctx_dev
;
697 int async_engines
, pi
;
699 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
701 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
705 ptx_dev
->ctx_shared
= false;
707 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
708 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
710 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
714 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
716 /* The current host thread has an active context for a different device.
719 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
722 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
725 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
727 ptx_dev
->ctx_shared
= true;
729 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
730 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
731 ptx_dev
->overlap
= pi
;
733 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
734 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
737 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
738 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
739 ptx_dev
->concur
= pi
;
741 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
742 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
745 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
746 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
749 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
750 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
751 ptx_dev
->clock_khz
= pi
;
753 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
754 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
755 ptx_dev
->num_sms
= pi
;
757 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
758 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
759 ptx_dev
->regs_per_block
= pi
;
761 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
762 in CUDA 6.0 and newer. */
763 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
764 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
766 /* Fallback: use limit of registers per block, which is usually equal. */
767 if (r
== CUDA_ERROR_INVALID_VALUE
)
768 pi
= ptx_dev
->regs_per_block
;
769 else if (r
!= CUDA_SUCCESS
)
771 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
774 ptx_dev
->regs_per_sm
= pi
;
776 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
777 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
780 GOMP_PLUGIN_error ("Only warp size 32 is supported");
783 ptx_dev
->warp_size
= pi
;
785 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
786 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
787 ptx_dev
->max_threads_per_block
= pi
;
789 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
790 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
791 ptx_dev
->max_threads_per_multiprocessor
= pi
;
793 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
794 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
795 if (r
!= CUDA_SUCCESS
)
798 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
799 ptx_dev
->default_dims
[i
] = 0;
801 ptx_dev
->images
= NULL
;
802 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
804 if (!init_streams_for_device (ptx_dev
, async_engines
))
811 nvptx_close_device (struct ptx_device
*ptx_dev
)
816 if (!fini_streams_for_device (ptx_dev
))
819 pthread_mutex_destroy (&ptx_dev
->image_lock
);
821 if (!ptx_dev
->ctx_shared
)
822 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
829 nvptx_get_num_devices (void)
833 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
835 if (sizeof (void *) != 8)
837 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
838 " only 64-bit configurations are supported\n");
842 /* This function will be called before the plugin has been initialized in
843 order to enumerate available devices, but CUDA API routines can't be used
844 until cuInit has been called. Just call it now (but don't yet do any
845 further initialization). */
846 if (instantiated_devices
== 0)
848 if (!init_cuda_lib ())
850 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
851 /* This is not an error: e.g. we may have CUDA libraries installed but
852 no devices available. */
853 if (r
!= CUDA_SUCCESS
)
855 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
861 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
866 notify_var (const char *var_name
, const char *env_var
)
869 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
871 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
875 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
877 const char *var_name
= "GOMP_NVPTX_JIT";
878 const char *env_var
= secure_getenv (var_name
);
879 notify_var (var_name
, env_var
);
884 const char *c
= env_var
;
890 if (c
[0] == '-' && c
[1] == 'O'
891 && '0' <= c
[2] && c
[2] <= '4'
892 && (c
[3] == '\0' || c
[3] == ' '))
894 *gomp_nvptx_o
= c
[2] - '0';
899 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
905 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
908 CUjit_option opts
[7];
913 CUlinkState linkstate
;
916 size_t linkoutsize
__attribute__ ((unused
));
918 opts
[0] = CU_JIT_WALL_TIME
;
919 optvals
[0] = &elapsed
;
921 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
922 optvals
[1] = &ilog
[0];
924 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
925 optvals
[2] = (void *) sizeof ilog
;
927 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
928 optvals
[3] = &elog
[0];
930 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
931 optvals
[4] = (void *) sizeof elog
;
933 opts
[5] = CU_JIT_LOG_VERBOSE
;
934 optvals
[5] = (void *) 1;
936 static intptr_t gomp_nvptx_o
= -1;
938 static bool init_done
= false;
941 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
946 if (gomp_nvptx_o
!= -1)
948 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
949 optvals
[nopts
] = (void *) gomp_nvptx_o
;
953 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
954 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
956 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
958 for (; num_objs
--; ptx_objs
++)
960 /* cuLinkAddData's 'data' argument erroneously omits the const
962 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
963 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
964 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
965 (char *) ptx_objs
->code
, ptx_objs
->size
,
968 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
969 (char *) ptx_objs
->code
, ptx_objs
->size
,
971 if (r
!= CUDA_SUCCESS
)
973 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
974 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
980 GOMP_PLUGIN_debug (0, "Linking\n");
981 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
983 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
984 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
986 if (r
!= CUDA_SUCCESS
)
988 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
992 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
993 CUDA_CALL (cuLinkDestroy
, linkstate
);
998 event_gc (bool memmap_lockable
)
1000 struct ptx_event
*ptx_event
= ptx_events
;
1001 struct ptx_event
*async_cleanups
= NULL
;
1002 struct nvptx_thread
*nvthd
= nvptx_thread ();
1004 pthread_mutex_lock (&ptx_event_lock
);
1006 while (ptx_event
!= NULL
)
1009 struct ptx_event
*e
= ptx_event
;
1011 ptx_event
= ptx_event
->next
;
1013 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1016 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1017 if (r
== CUDA_SUCCESS
)
1019 bool append_async
= false;
1034 case PTX_EVT_ASYNC_CLEANUP
:
1036 /* The function gomp_plugin_async_unmap_vars needs to claim the
1037 memory-map splay tree lock for the current device, so we
1038 can't call it when one of our callers has already claimed
1039 the lock. In that case, just delay the GC for this event
1041 if (!memmap_lockable
)
1044 append_async
= true;
1049 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1052 /* Unlink 'e' from ptx_events list. */
1053 if (ptx_events
== e
)
1054 ptx_events
= ptx_events
->next
;
1057 struct ptx_event
*e_
= ptx_events
;
1058 while (e_
->next
!= e
)
1060 e_
->next
= e_
->next
->next
;
1065 e
->next
= async_cleanups
;
1073 pthread_mutex_unlock (&ptx_event_lock
);
1075 /* We have to do these here, after ptx_event_lock is released. */
1076 while (async_cleanups
)
1078 struct ptx_event
*e
= async_cleanups
;
1079 async_cleanups
= async_cleanups
->next
;
1081 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
1087 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1089 struct ptx_event
*ptx_event
;
1090 struct nvptx_thread
*nvthd
= nvptx_thread ();
1092 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1093 || type
== PTX_EVT_ASYNC_CLEANUP
);
1095 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1096 ptx_event
->type
= type
;
1098 ptx_event
->addr
= h
;
1099 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1100 ptx_event
->val
= val
;
1102 pthread_mutex_lock (&ptx_event_lock
);
1104 ptx_event
->next
= ptx_events
;
1105 ptx_events
= ptx_event
;
1107 pthread_mutex_unlock (&ptx_event_lock
);
1111 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1112 int async
, unsigned *dims
, void *targ_mem_desc
)
1114 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1115 CUfunction function
;
1118 struct ptx_stream
*dev_str
;
1121 struct nvptx_thread
*nvthd
= nvptx_thread ();
1122 int warp_size
= nvthd
->ptx_dev
->warp_size
;
1123 const char *maybe_abort_msg
= "(perhaps abort was called)";
1125 function
= targ_fn
->fn
;
1127 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1128 assert (dev_str
== nvthd
->current_stream
);
1130 /* Initialize the launch dimensions. Typically this is constant,
1131 provided by the device compiler, but we must permit runtime
1134 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1136 if (targ_fn
->launch
->dim
[i
])
1137 dims
[i
] = targ_fn
->launch
->dim
[i
];
1144 pthread_mutex_lock (&ptx_dev_lock
);
1146 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
1147 if (!gomp_openacc_dims
[0])
1149 /* See if the user provided GOMP_OPENACC_DIM environment
1150 variable to specify runtime defaults. */
1151 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1152 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1155 if (!nvthd
->ptx_dev
->default_dims
[0])
1157 int default_dims
[GOMP_DIM_MAX
];
1158 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1159 default_dims
[i
] = gomp_openacc_dims
[i
];
1161 int gang
, worker
, vector
;
1163 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
1164 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
1165 int dev_size
= nvthd
->ptx_dev
->num_sms
;
1166 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1167 " dev_size=%d, cpu_size=%d\n",
1168 warp_size
, block_size
, dev_size
, cpu_size
);
1170 gang
= (cpu_size
/ block_size
) * dev_size
;
1171 worker
= block_size
/ warp_size
;
1175 /* There is no upper bound on the gang size. The best size
1176 matches the hardware configuration. Logical gangs are
1177 scheduled onto physical hardware. To maximize usage, we
1178 should guess a large number. */
1179 if (default_dims
[GOMP_DIM_GANG
] < 1)
1180 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1181 /* The worker size must not exceed the hardware. */
1182 if (default_dims
[GOMP_DIM_WORKER
] < 1
1183 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1184 default_dims
[GOMP_DIM_WORKER
] = worker
;
1185 /* The vector size must exactly match the hardware. */
1186 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1187 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1188 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1190 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1191 default_dims
[GOMP_DIM_GANG
],
1192 default_dims
[GOMP_DIM_WORKER
],
1193 default_dims
[GOMP_DIM_VECTOR
]);
1195 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1196 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
1198 pthread_mutex_unlock (&ptx_dev_lock
);
1201 bool default_dim_p
[GOMP_DIM_MAX
];
1202 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1204 default_dim_p
[i
] = !dims
[i
];
1205 if (default_dim_p
[i
])
1206 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
1209 if (default_dim_p
[GOMP_DIM_VECTOR
])
1210 dims
[GOMP_DIM_VECTOR
]
1211 = MIN (dims
[GOMP_DIM_VECTOR
],
1212 (targ_fn
->max_threads_per_block
/ warp_size
* warp_size
));
1214 if (default_dim_p
[GOMP_DIM_WORKER
])
1215 dims
[GOMP_DIM_WORKER
]
1216 = MIN (dims
[GOMP_DIM_WORKER
],
1217 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
1221 /* Check if the accelerator has sufficient hardware resources to
1222 launch the offloaded kernel. */
1223 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
1224 > targ_fn
->max_threads_per_block
)
1227 = targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
];
1228 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1229 " launch '%s' with num_workers = %d; recompile the"
1230 " program with 'num_workers = %d' on that offloaded"
1231 " region or '-fopenacc-dim=:%d'",
1232 targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1233 suggest_workers
, suggest_workers
);
1236 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1237 the host and the device. HP is a host pointer to the new chunk, and DP is
1238 the corresponding device pointer. */
1239 map_push (dev_str
, mapnum
* sizeof (void *), &hp
, &dp
);
1241 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1243 /* Copy the array of arguments to the mapped page. */
1244 for (i
= 0; i
< mapnum
; i
++)
1245 ((void **) hp
)[i
] = devaddrs
[i
];
1247 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1248 fact have the same value on a unified-memory system). */
1249 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
1250 mapnum
* sizeof (void *));
1251 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1252 " gangs=%u, workers=%u, vectors=%u\n",
1253 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1254 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1258 // num_gangs nctaid.x
1259 // num_workers ntid.y
1260 // vector length ntid.x
1263 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1264 dims
[GOMP_DIM_GANG
], 1, 1,
1265 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1266 0, dev_str
->stream
, kargs
, 0);
1268 #ifndef DISABLE_ASYNC
1269 if (async
< acc_async_noval
)
1271 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1272 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1273 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1275 else if (r
!= CUDA_SUCCESS
)
1276 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1282 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1284 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1285 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1286 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1288 else if (r
!= CUDA_SUCCESS
)
1289 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1293 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1295 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1298 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1299 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1300 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1302 else if (r
!= CUDA_SUCCESS
)
1303 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1306 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1307 targ_fn
->launch
->fn
);
1309 #ifndef DISABLE_ASYNC
1310 if (async
< acc_async_noval
)
1315 void * openacc_get_current_cuda_context (void);
1318 nvptx_alloc (size_t s
)
1322 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1327 nvptx_free (void *p
)
1332 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1333 if ((CUdeviceptr
) p
!= pb
)
1335 GOMP_PLUGIN_error ("invalid device address");
1339 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1345 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1349 struct nvptx_thread
*nvthd
= nvptx_thread ();
1355 GOMP_PLUGIN_error ("invalid device address");
1359 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1363 GOMP_PLUGIN_error ("invalid device address");
1368 GOMP_PLUGIN_error ("invalid host address");
1373 GOMP_PLUGIN_error ("invalid host or device address");
1376 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1378 GOMP_PLUGIN_error ("invalid size");
1382 #ifndef DISABLE_ASYNC
1383 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1385 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1386 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1388 CUDA_CALL (cuMemcpyHtoDAsync
,
1389 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1390 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1391 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1395 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
/* Copy S bytes from device buffer D to host buffer H — the mirror of
   nvptx_host2dev.  Same validation of the device allocation bounds,
   same async/sync split on the current stream.
   NOTE(review): the extraction dropped several original lines
   (declarations of pb/ps and the conditions guarding the error calls);
   annotated byte-identical, not repaired.  */
1401 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1405 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* NOTE(review): guarding condition dropped by extraction.  */
1411 GOMP_PLUGIN_error ("invalid device address");
/* Look up the driver allocation containing D for bounds checking.  */
1415 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1419 GOMP_PLUGIN_error ("invalid device address");
1424 GOMP_PLUGIN_error ("invalid host address");
1429 GOMP_PLUGIN_error ("invalid host or device address");
/* Reject copies overrunning the device allocation.  */
1432 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1434 GOMP_PLUGIN_error ("invalid size");
1438 #ifndef DISABLE_ASYNC
/* Async path on this thread's non-null stream.  */
1439 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
/* Event is heap-allocated; ownership passes to event_add.  */
1441 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1442 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1444 CUDA_CALL (cuMemcpyDtoHAsync
,
1445 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1446 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1447 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
/* Synchronous fallback.  */
1451 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1457 nvptx_set_async (int async
)
1459 struct nvptx_thread
*nvthd
= nvptx_thread ();
1460 nvthd
->current_stream
1461 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1465 nvptx_async_test (int async
)
1468 struct ptx_stream
*s
;
1470 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1473 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1475 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1476 if (r
== CUDA_SUCCESS
)
1478 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1479 whether all work has completed on this stream, and if so omits the call
1480 to the wait hook. If that happens, event_gc might not get called
1481 (which prevents variables from getting unmapped and their associated
1482 device storage freed), so call it here. */
1486 else if (r
== CUDA_ERROR_NOT_READY
)
1489 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
/* Test whether ALL streams belonging to this thread (plus any
   multithreaded streams) are idle.  Walks the device's active-stream
   list under stream_lock; the first busy stream causes an early exit.
   NOTE(review): the extraction dropped the return type and the return
   statements (presumably return 0 on a busy stream, return 1 at the
   end, likely with an event_gc call — confirm against upstream);
   annotated byte-identical, not repaired.  */
1495 nvptx_async_test_all (void)
1497 struct ptx_stream
*s
;
1498 pthread_t self
= pthread_self ();
1499 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* The active-stream list is shared across threads; guard the walk.  */
1501 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1503 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
/* Only consider streams owned by this thread, or shared ones.  */
1505 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1506 && CUDA_CALL_NOCHECK (cuStreamQuery
,
1507 s
->stream
) == CUDA_ERROR_NOT_READY
)
/* A stream is still busy: release the lock before bailing out.  */
1509 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* All relevant streams idle.  */
1514 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1522 nvptx_wait (int async
)
1524 struct ptx_stream
*s
;
1526 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1528 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1530 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
/* Make stream ASYNC2 wait (on-device, without blocking the host) for
   all work currently queued on stream ASYNC1, by recording an event on
   ASYNC1 and inserting a cuStreamWaitEvent on ASYNC2.
   NOTE(review): the extraction dropped the conditions guarding the two
   fatal calls (presumably !s1, and s1 == s2) and the CUevent *e
   declaration; annotated byte-identical, not repaired.  */
1536 nvptx_wait_async (int async1
, int async2
)
1539 struct ptx_stream
*s1
, *s2
;
1540 pthread_t self
= pthread_self ();
1542 /* The stream that is waiting (rather than being waited for) doesn't
1543 necessarily have to exist already. */
1544 s2
= select_stream_for_async (async2
, self
, true, NULL
);
/* The waited-for stream must already exist (CREATE is false).  */
1546 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1548 GOMP_PLUGIN_fatal ("invalid async 1\n");
/* Waiting on oneself is meaningless.  */
1551 GOMP_PLUGIN_fatal ("identical parameters");
/* Event is heap-allocated; ownership passes to event_add below.  */
1553 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1555 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
/* Mark the current tail of s1's work.  */
1559 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1561 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
/* s2 will not run past this point until the event on s1 fires.  */
1563 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
/* Block until every stream owned by this thread (and every
   multithreaded stream) is idle.  Streams already idle (cuStreamQuery
   == CUDA_SUCCESS) are skipped; busy ones get a cuStreamSynchronize.
   NOTE(review): the extraction dropped the CUresult r declaration, the
   skip branch body after the CUDA_SUCCESS check (presumably continue;)
   and (per upstream) a trailing event_gc call; annotated
   byte-identical, not repaired.  */
1567 nvptx_wait_all (void)
1570 struct ptx_stream
*s
;
1571 pthread_t self
= pthread_self ();
1572 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* Protect the shared active-stream list during the walk.  */
1574 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1576 /* Wait for active streams initiated by this thread (or by multiple threads)
1578 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1580 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
/* Query first: an already-idle stream needs no synchronize.  */
1582 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1583 if (r
== CUDA_SUCCESS
)
/* Any status other than "done" or "still running" is fatal.  */
1585 else if (r
!= CUDA_ERROR_NOT_READY
)
1586 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1588 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1592 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Make stream ASYNC wait (on-device) for all other active streams of
   this device: for each other stream, record an event and insert a
   cuStreamWaitEvent on the waiting stream.
   NOTE(review): the extraction dropped the lvalue of the assignment at
   original line 1608 (waiting_stream =), the early return when the
   waiting stream is the null stream, the CUevent *e declaration, and
   the tail of the comment at original line 1610; annotated
   byte-identical, not repaired.  */
1598 nvptx_wait_all_async (int async
)
1600 struct ptx_stream
*waiting_stream
, *other_stream
;
1602 struct nvptx_thread
*nvthd
= nvptx_thread ();
1603 pthread_t self
= pthread_self ();
1605 /* The stream doing the waiting. This could be the first mention of the
1606 stream, so create it if necessary. */
1608 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1610 /* Launches on the null stream already block on other streams in the
/* Null stream has implicit synchronization; nothing to do.  */
1612 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
/* Walk the shared active-stream list under the lock.  */
1617 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1619 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1620 other_stream
!= NULL
;
1621 other_stream
= other_stream
->next
)
/* Skip streams that belong exclusively to other host threads.  */
1623 if (!other_stream
->multithreaded
1624 && !pthread_equal (other_stream
->host_thread
, self
))
/* One heap-allocated event per waited-for stream; event_add owns it.  */
1627 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1629 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1631 /* Record an event on the waited-for stream. */
1632 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1634 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
/* The waiting stream stalls until that event fires.  */
1636 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1639 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1643 nvptx_get_current_cuda_device (void)
1645 struct nvptx_thread
*nvthd
= nvptx_thread ();
1647 if (!nvthd
|| !nvthd
->ptx_dev
)
1650 return &nvthd
->ptx_dev
->dev
;
1654 nvptx_get_current_cuda_context (void)
1656 struct nvptx_thread
*nvthd
= nvptx_thread ();
1658 if (!nvthd
|| !nvthd
->ptx_dev
)
1661 return nvthd
->ptx_dev
->ctx
;
1665 nvptx_get_cuda_stream (int async
)
1667 struct ptx_stream
*s
;
1668 struct nvptx_thread
*nvthd
= nvptx_thread ();
1670 if (!nvthd
|| !nvthd
->ptx_dev
)
1673 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1675 return s
? s
->stream
: NULL
;
/* Install a user-supplied raw CUstream as async queue ASYNC.  Any
   previously-installed stream for that async value is unlinked from the
   device's active-stream list, destroyed and its host-side map freed,
   then the new stream is registered via select_stream_for_async.
   NOTE(review): the extraction dropped the guard before the "bad async"
   fatal (presumably async < 0), the NULL/ownership checks around
   oldstream, several braces, and the return statement; annotated
   byte-identical, not repaired.  */
1679 nvptx_set_cuda_stream (int async
, void *stream
)
1681 struct ptx_stream
*oldstream
;
1682 pthread_t self
= pthread_self ();
1683 struct nvptx_thread
*nvthd
= nvptx_thread ();
/* NOTE(review): guarding condition dropped by extraction.  */
1686 GOMP_PLUGIN_fatal ("bad async %d", async
);
/* List surgery below must be protected from other host threads.  */
1688 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1690 /* We have a list of active streams and an array mapping async values to
1691 entries of that list. We need to take "ownership" of the passed-in stream,
1692 and add it to our list, removing the previous entry also (if there was one)
1693 in order to prevent resource leaks. Note the potential for surprise
1694 here: maybe we should keep track of passed-in streams and leave it up to
1695 the user to tidy those up, but that doesn't work for stream handles
1696 returned from acc_get_cuda_stream above... */
1698 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
/* Unlink oldstream: either it is the list head...  */
1702 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1703 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
/* ...or we must find its predecessor and splice it out.  */
1706 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1707 while (s
->next
!= oldstream
)
1709 s
->next
= s
->next
->next
;
/* Destroy the driver stream and the host-side mapping buffer.  */
1712 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1714 if (!map_fini (oldstream
))
1715 GOMP_PLUGIN_fatal ("error when freeing host memory");
1720 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Register the user's stream under ASYNC (CREATE is true; the existing
   stream argument is adopted rather than a new one created).  */
1722 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1727 /* Plugin entry points. */
1730 GOMP_OFFLOAD_get_name (void)
1736 GOMP_OFFLOAD_get_caps (void)
1738 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1742 GOMP_OFFLOAD_get_type (void)
1744 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1748 GOMP_OFFLOAD_get_num_devices (void)
1750 return nvptx_get_num_devices ();
/* Initialize device N: perform one-time driver init (nvptx_init) if
   needed, open the device, and record it in the ptx_devices table under
   ptx_dev_lock.  NOTE(review): the extraction dropped the return type,
   the early-return body after the "already initialized" check, the
   failure check on nvptx_open_device, and the return statements;
   annotated byte-identical, not repaired.  */
1754 GOMP_OFFLOAD_init_device (int n
)
1756 struct ptx_device
*dev
;
/* Serialize device-table access against concurrent init/fini.  */
1758 pthread_mutex_lock (&ptx_dev_lock
);
/* Bail out if driver init failed or device N is already open.  */
1760 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1762 pthread_mutex_unlock (&ptx_dev_lock
);
1766 dev
= nvptx_open_device (n
);
/* Publish the opened device and bump the live-device count.  */
1769 ptx_devices
[n
] = dev
;
1770 instantiated_devices
++;
1773 pthread_mutex_unlock (&ptx_dev_lock
);
/* Finalize device N: attach the host thread, close the device, clear
   its table slot and decrement the live-device count — all under
   ptx_dev_lock.  NOTE(review): the extraction dropped the return type,
   the failure-return body, and the final return; annotated
   byte-identical, not repaired.  */
1779 GOMP_OFFLOAD_fini_device (int n
)
1781 pthread_mutex_lock (&ptx_dev_lock
);
/* Nothing to do if the device was never opened.  */
1783 if (ptx_devices
[n
] != NULL
)
1785 if (!nvptx_attach_host_thread_to_device (n
)
1786 || !nvptx_close_device (ptx_devices
[n
]))
/* Failure path: drop the lock before reporting/returning.  */
1788 pthread_mutex_unlock (&ptx_dev_lock
);
1791 ptx_devices
[n
] = NULL
;
1792 instantiated_devices
--;
1795 pthread_mutex_unlock (&ptx_dev_lock
);
1799 /* Return the libgomp version number we're compatible with. There is
1800 no requirement for cross-version compatibility. */
1803 GOMP_OFFLOAD_version (void)
1805 return GOMP_VERSION
;
1808 /* Initialize __nvptx_clocktick, if present in MODULE. */
/* Writes the device-global __nvptx_clocktick (seconds per SM clock
   tick, derived from the device clock rate in kHz) into MODULE if the
   symbol exists; silently does nothing if the module doesn't define it.
   NOTE(review): the extraction dropped the return type, the CUdeviceptr
   dptr declaration, and the early-return body after the NOT_FOUND
   check; annotated byte-identical, not repaired.  */
1811 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
/* Look up the symbol; NOT_FOUND is expected for modules that don't
   use the clock.  */
1814 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1815 module
, "__nvptx_clocktick");
1816 if (r
== CUDA_ERROR_NOT_FOUND
)
1818 if (r
!= CUDA_SUCCESS
)
1819 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
/* clock_khz is in kHz, so 1e-3/khz gives seconds per tick.  */
1820 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1821 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1822 sizeof (__nvptx_clocktick
));
1823 if (r
!= CUDA_SUCCESS
)
1824 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1827 /* Load the (partial) program described by TARGET_DATA to device
1828 number ORD. Allocate and return TARGET_TABLE. */
/* Returns the number of table entries (functions + variables), or -1
   on error.  Links the embedded PTX into a CUmodule, registers the
   image on the device, then fills TARGET_TABLE with one addr_pair per
   kernel (pointing at a targ_fn_descriptor) followed by one per global
   variable (device address + size).
   NOTE(review): the extraction dropped the return type, the CUmodule
   declaration, several error-return bodies, the second size factor of
   the targ_fns allocation, nregs/mthrs/var/bytes declarations, and the
   kernel-name argument of cuModuleGetFunction; annotated
   byte-identical, not repaired.  */
1831 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1832 struct addr_pair
**target_table
)
1835 const char *const *var_names
;
1836 const struct targ_fn_launch
*fn_descs
;
1837 unsigned int fn_entries
, var_entries
, i
, j
;
1838 struct targ_fn_descriptor
*targ_fns
;
1839 struct addr_pair
*targ_tbl
;
1840 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1841 struct ptx_image_data
*new_image
;
1842 struct ptx_device
*dev
;
/* Refuse images built by a newer mkoffload than this plugin knows.  */
1844 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1846 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1847 " (expected %u, received %u)",
1848 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
/* JIT-link the PTX objects of this image into a CUmodule.  */
1852 if (!nvptx_attach_host_thread_to_device (ord
)
1853 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1856 dev
= ptx_devices
[ord
];
1858 /* The mkoffload utility emits a struct of pointers/integers at the
1859 start of each offload image. The array of kernel names and the
1860 functions addresses form a one-to-one correspondence. */
1862 var_entries
= img_header
->var_num
;
1863 var_names
= img_header
->var_names
;
1864 fn_entries
= img_header
->fn_num
;
1865 fn_descs
= img_header
->fn_descs
;
/* One table slot per kernel plus one per variable.  */
1867 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1868 * (fn_entries
+ var_entries
));
1869 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1872 *target_table
= targ_tbl
;
/* Register this image on the device so unload_image can find it.  */
1874 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1875 new_image
->target_data
= target_data
;
1876 new_image
->module
= module
;
1877 new_image
->fns
= targ_fns
;
1879 pthread_mutex_lock (&dev
->image_lock
);
1880 new_image
->next
= dev
->images
;
1881 dev
->images
= new_image
;
1882 pthread_mutex_unlock (&dev
->image_lock
);
/* Resolve each kernel and cache its launch-relevant attributes.  */
1884 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1886 CUfunction function
;
1889 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1891 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1892 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1893 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1894 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1896 targ_fns
->fn
= function
;
1897 targ_fns
->launch
= &fn_descs
[i
];
1898 targ_fns
->regs_per_thread
= nregs
;
1899 targ_fns
->max_threads_per_block
= mthrs
;
/* Function table entries carry the descriptor address, not a range.  */
1901 targ_tbl
->start
= (uintptr_t) targ_fns
;
1902 targ_tbl
->end
= targ_tbl
->start
+ 1;
/* Resolve each global variable to its device address and size.  */
1905 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1910 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1911 &var
, &bytes
, module
, var_names
[j
]);
1913 targ_tbl
->start
= (uintptr_t) var
;
1914 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1917 nvptx_set_clocktick (module
, dev
);
1919 return fn_entries
+ var_entries
;
1922 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1923 function descriptors allocated by G_O_load_image. */
/* Finds the image registered for TARGET_DATA on device ORD, unlinks it
   from the device's image list under image_lock, and unloads its
   CUmodule.  NOTE(review): the extraction dropped the return type, the
   "ret" local and return statements, the free() of the image/fns
   storage, and the loop-break after a match; annotated byte-identical,
   not repaired.  */
1926 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1928 struct ptx_image_data
*image
, **prev_p
;
1929 struct ptx_device
*dev
= ptx_devices
[ord
];
/* Version check mirrors load_image.  */
1931 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1933 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1934 " (expected %u, received %u)",
1935 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
/* Walk the image list with a pointer-to-link so unlinking is O(1).  */
1940 pthread_mutex_lock (&dev
->image_lock
);
1941 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1942 if (image
->target_data
== target_data
)
1944 *prev_p
= image
->next
;
1945 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1951 pthread_mutex_unlock (&dev
->image_lock
);
1956 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1958 if (!nvptx_attach_host_thread_to_device (ord
))
1960 return nvptx_alloc (size
);
1964 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1966 return (nvptx_attach_host_thread_to_device (ord
)
1967 && nvptx_free (ptr
));
1971 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1973 return (nvptx_attach_host_thread_to_device (ord
)
1974 && nvptx_dev2host (dst
, src
, n
));
1978 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1980 return (nvptx_attach_host_thread_to_device (ord
)
1981 && nvptx_host2dev (dst
, src
, n
));
1985 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1987 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1988 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
1989 ptx_dev
->null_stream
->stream
);
1993 void (*device_run
) (int n
, void *fn_ptr
, void *vars
) = NULL
;
1996 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
1997 void **hostaddrs
, void **devaddrs
,
1998 int async
, unsigned *dims
, void *targ_mem_desc
)
2000 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, async
, dims
, targ_mem_desc
);
2004 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
2006 struct nvptx_thread
*nvthd
= nvptx_thread ();
2007 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
2009 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
2010 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
2011 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
2015 GOMP_OFFLOAD_openacc_async_test (int async
)
2017 return nvptx_async_test (async
);
2021 GOMP_OFFLOAD_openacc_async_test_all (void)
2023 return nvptx_async_test_all ();
2027 GOMP_OFFLOAD_openacc_async_wait (int async
)
2033 GOMP_OFFLOAD_openacc_async_wait_async (int async1
, int async2
)
2035 nvptx_wait_async (async1
, async2
);
2039 GOMP_OFFLOAD_openacc_async_wait_all (void)
2045 GOMP_OFFLOAD_openacc_async_wait_all_async (int async
)
2047 nvptx_wait_all_async (async
);
2051 GOMP_OFFLOAD_openacc_async_set_async (int async
)
2053 nvptx_set_async (async
);
2057 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
2059 struct ptx_device
*ptx_dev
;
2060 struct nvptx_thread
*nvthd
2061 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
2064 ptx_dev
= ptx_devices
[ord
];
2068 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
2070 assert (ptx_dev
->ctx
);
2073 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
2075 nvthd
->current_stream
= ptx_dev
->null_stream
;
2076 nvthd
->ptx_dev
= ptx_dev
;
2078 return (void *) nvthd
;
2082 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)
2088 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2090 return nvptx_get_current_cuda_device ();
2094 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2096 return nvptx_get_current_cuda_context ();
2099 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2102 GOMP_OFFLOAD_openacc_cuda_get_stream (int async
)
2104 return nvptx_get_cuda_stream (async
);
2107 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2110 GOMP_OFFLOAD_openacc_cuda_set_stream (int async
, void *stream
)
2112 return nvptx_set_cuda_stream (async
, stream
);
2115 /* Adjust launch dimensions: pick good values for number of blocks and warps
2116 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2120 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2121 struct ptx_device
*ptx_dev
,
2122 int *teams_p
, int *threads_p
)
2124 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2125 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2126 and libgcc, which matches documented limit of all GPUs as of 2015. */
2127 if (max_warps_block
> 32)
2128 max_warps_block
= 32;
2129 if (*threads_p
<= 0)
2131 if (*threads_p
> max_warps_block
)
2132 *threads_p
= max_warps_block
;
2134 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2135 /* This is an estimate of how many blocks the device can host simultaneously.
2136 Actual limit, which may be lower, can be queried with "occupancy control"
2137 driver interface (since CUDA 6.0). */
2138 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2139 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2140 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for
   OpenMP target regions.  NOTE(review): the 128 KiB constant is
   reconstructed from upstream libgomp — the extraction dropped the
   function body.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
2152 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2155 nvptx_stacks_alloc (size_t size
, int num
)
2158 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
2159 if (r
!= CUDA_SUCCESS
)
2160 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2161 return (void *) stacks
;
2164 /* Release storage previously allocated by nvptx_stacks_alloc. */
2167 nvptx_stacks_free (void *p
, int num
)
2169 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
2170 if (r
!= CUDA_SUCCESS
)
2171 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
/* OpenMP entry point: synchronously run kernel TGT_FN with argument
   block TGT_VARS on device ORD.  ARGS is the GOMP_TARGET_ARG_* encoded
   list carrying num_teams/thread_limit; launch bounds are then adjusted
   against device limits, per-warp soft stacks are allocated, the kernel
   is launched on the null stream and the context is synchronized.
   NOTE(review): the extraction dropped the return type, the CUresult r
   declaration, the loop head over *args, the assignments of teams/
   threads from val, the kernel-args array head, the trailing
   cuLaunchKernel arguments, and the maybe_abort_msg argument of the
   LAUNCH_FAILED fatal; annotated byte-identical, not repaired.  */
2175 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2177 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
2179 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2180 const char *maybe_abort_msg
= "(perhaps abort was called)";
2181 int teams
= 0, threads
= 0;
2184 GOMP_PLUGIN_fatal ("No target arguments provided");
/* Decode one GOMP_TARGET_ARG entry; a SUBSEQUENT_PARAM carries its
   value in the next args slot, otherwise it is packed in the id.  */
2187 intptr_t id
= (intptr_t) *args
++, val
;
2188 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2189 val
= (intptr_t) *args
++;
2191 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
/* Skip entries that target a different device class.  */
2192 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
/* Clamp to int range before use as a launch dimension.  */
2194 val
= val
> INT_MAX
? INT_MAX
: val
;
2195 id
&= GOMP_TARGET_ARG_ID_MASK
;
2196 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2198 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
/* Fit requested teams/threads to this kernel and device.  */
2201 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
/* One soft-stack per warp: teams * threads warps in total.  */
2203 size_t stack_size
= nvptx_stacks_size ();
2204 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2205 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2206 size_t fn_args_size
= sizeof fn_args
;
/* Pass the argument block via the buffer-pointer/size launch params.  */
2208 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2209 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
/* Grid = teams x 1 x 1, block = 32 (warp) x threads x 1.  */
2212 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2213 32, threads
, 1, 0, ptx_dev
->null_stream
->stream
,
2215 if (r
!= CUDA_SUCCESS
)
2216 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
/* Synchronize to surface in-kernel failures before freeing stacks.  */
2218 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2219 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2220 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2222 else if (r
!= CUDA_SUCCESS
)
2223 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2224 nvptx_stacks_free (stacks
, teams
* threads
);
/* Asynchronous OpenMP target execution is not supported by this
   plugin; libgomp falls back accordingly.  Always fatal if reached.
   NOTE(review): the final parameter name/type was dropped by the
   extraction; reconstructed as the async completion token per the
   libgomp plugin interface — confirm against libgomp-plugin.h.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}