1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 typedef size_t (*CUoccupancyB2DSize
)(int);
65 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
66 const char *, unsigned, CUjit_option
*, void **);
67 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
68 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
69 CUoccupancyB2DSize
, size_t, int);
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
83 #include "cuda-lib.def"
85 # undef CUDA_ONE_CALL_MAYBE_NULL
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited
= -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers, return false if unsuccessful, true if successful. */
98 if (cuda_lib_inited
!= -1)
99 return cuda_lib_inited
;
100 const char *cuda_runtime_lib
= "libcuda.so.1";
101 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
102 cuda_lib_inited
= false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited
= true;
120 # define CUDA_CALL_PREFIX cuda_lib.
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
133 #include "secure_getenv.h"
/* Function-like minimum/maximum helpers.  Each argument appears twice in
   the expansion, so it may be evaluated twice: do not pass expressions
   with side effects.  */
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
177 cuda_error (CUresult r
)
179 const char *fallback
= "unknown cuda error";
182 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
185 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
186 if (r
== CUDA_SUCCESS
)
192 static unsigned int instantiated_devices
= 0;
193 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
200 struct cuda_map
*next
;
206 pthread_t host_thread
;
208 struct cuda_map
*map
;
209 struct ptx_stream
*next
;
212 /* Thread-specific data for PTX. */
216 struct ptx_stream
*current_stream
;
217 struct ptx_device
*ptx_dev
;
220 static struct cuda_map
*
221 cuda_map_create (size_t size
)
223 struct cuda_map
*map
= GOMP_PLUGIN_malloc (sizeof (struct cuda_map
));
231 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &map
->d
, size
);
238 cuda_map_destroy (struct cuda_map
*map
)
240 CUDA_CALL_ASSERT (cuMemFree
, map
->d
);
244 /* The following map_* routines manage the CUDA device memory that
245 contains the data mapping arguments for cuLaunchKernel. Each
246 asynchronous PTX stream may have multiple pending kernel
247 invocations, which are launched in a FIFO order. As such, the map
248 routines maintain a queue of cuLaunchKernel arguments.
250 Calls to map_push and map_pop must be guarded by ptx_event_lock.
251 Likewise, calls to map_init and map_fini are guarded by
252 ptx_dev_lock inside GOMP_OFFLOAD_init_device and
253 GOMP_OFFLOAD_fini_device, respectively. */
256 map_init (struct ptx_stream
*s
)
258 int size
= getpagesize ();
262 s
->map
= cuda_map_create (size
);
268 map_fini (struct ptx_stream
*s
)
270 assert (s
->map
->next
== NULL
);
271 assert (!s
->map
->active
);
273 cuda_map_destroy (s
->map
);
279 map_pop (struct ptx_stream
*s
)
281 struct cuda_map
*next
;
285 if (s
->map
->next
== NULL
)
287 s
->map
->active
= false;
292 cuda_map_destroy (s
->map
);
297 map_push (struct ptx_stream
*s
, size_t size
)
299 struct cuda_map
*map
= NULL
, *t
= NULL
;
304 /* Each PTX stream requires a separate data region to store the
305 launch arguments for cuLaunchKernel. Allocate a new
306 cuda_map and push it to the end of the list. */
309 map
= cuda_map_create (size
);
311 for (t
= s
->map
; t
->next
!= NULL
; t
= t
->next
)
316 else if (s
->map
->size
< size
)
318 cuda_map_destroy (s
->map
);
319 map
= cuda_map_create (size
);
325 s
->map
->active
= true;
330 /* Target data function launch information. */
332 struct targ_fn_launch
335 unsigned short dim
[GOMP_DIM_MAX
];
338 /* Target PTX object information. */
346 /* Target data image information. */
348 typedef struct nvptx_tdata
350 const struct targ_ptx_obj
*ptx_objs
;
353 const char *const *var_names
;
356 const struct targ_fn_launch
*fn_descs
;
360 /* Descriptor of a loaded function. */
362 struct targ_fn_descriptor
365 const struct targ_fn_launch
*launch
;
367 int max_threads_per_block
;
370 /* A loaded PTX image. */
371 struct ptx_image_data
373 const void *target_data
;
376 struct targ_fn_descriptor
*fns
; /* Array of functions. */
378 struct ptx_image_data
*next
;
386 struct ptx_stream
*null_stream
;
387 /* All non-null streams associated with this device (actually context),
388 either created implicitly or passed in from the user (via
389 acc_set_cuda_stream). */
390 struct ptx_stream
*active_streams
;
392 struct ptx_stream
**arr
;
395 /* A lock for use when manipulating the above stream list and array. */
396 pthread_mutex_t stream_lock
;
408 int max_threads_per_block
;
409 int max_threads_per_multiprocessor
;
410 int default_dims
[GOMP_DIM_MAX
];
412 struct ptx_image_data
*images
; /* Images loaded on device. */
413 pthread_mutex_t image_lock
; /* Lock for above list. */
415 struct ptx_device
*next
;
423 PTX_EVT_ASYNC_CLEANUP
434 struct ptx_event
*next
;
437 static pthread_mutex_t ptx_event_lock
;
438 static struct ptx_event
*ptx_events
;
440 static struct ptx_device
**ptx_devices
;
442 static inline struct nvptx_thread
*
445 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
449 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
452 struct ptx_stream
*null_stream
453 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
455 null_stream
->stream
= NULL
;
456 null_stream
->host_thread
= pthread_self ();
457 null_stream
->multithreaded
= true;
458 if (!map_init (null_stream
))
461 ptx_dev
->null_stream
= null_stream
;
462 ptx_dev
->active_streams
= NULL
;
463 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
468 /* This is just a guess -- make space for as many async streams as the
469 current device is capable of concurrently executing. This can grow
470 later as necessary. No streams are created yet. */
471 ptx_dev
->async_streams
.arr
472 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
473 ptx_dev
->async_streams
.size
= concurrency
;
475 for (i
= 0; i
< concurrency
; i
++)
476 ptx_dev
->async_streams
.arr
[i
] = NULL
;
482 fini_streams_for_device (struct ptx_device
*ptx_dev
)
484 free (ptx_dev
->async_streams
.arr
);
487 while (ptx_dev
->active_streams
!= NULL
)
489 struct ptx_stream
*s
= ptx_dev
->active_streams
;
490 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
494 CUresult r
= CUDA_CALL_NOCHECK (cuStreamDestroy
, s
->stream
);
495 if (r
!= CUDA_SUCCESS
)
497 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
503 ret
&= map_fini (ptx_dev
->null_stream
);
504 free (ptx_dev
->null_stream
);
508 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
509 thread THREAD (and also current device/context). If CREATE is true, create
510 the stream if it does not exist (or use EXISTING if it is non-NULL), and
511 associate the stream with the same thread argument. Returns stream to use
514 static struct ptx_stream
*
515 select_stream_for_async (int async
, pthread_t thread
, bool create
,
518 struct nvptx_thread
*nvthd
= nvptx_thread ();
519 /* Local copy of TLS variable. */
520 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
521 struct ptx_stream
*stream
= NULL
;
522 int orig_async
= async
;
524 /* The special value acc_async_noval (-1) maps (for now) to an
525 implicitly-created stream, which is then handled the same as any other
526 numbered async stream. Other options are available, e.g. using the null
527 stream for anonymous async operations, or choosing an idle stream from an
528 active set. But, stick with this for now. */
529 if (async
> acc_async_sync
)
533 pthread_mutex_lock (&ptx_dev
->stream_lock
);
535 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
536 null stream, and in fact better performance may be obtainable if it doesn't
537 (because the null stream enforces overly-strict synchronisation with
538 respect to other streams for legacy reasons, and that's probably not
539 needed with OpenACC). Maybe investigate later. */
540 if (async
== acc_async_sync
)
541 stream
= ptx_dev
->null_stream
;
542 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
543 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
544 stream
= ptx_dev
->async_streams
.arr
[async
];
545 else if (async
>= 0 && create
)
/* Grow the async_streams array when ASYNC lies beyond its current size;
   every newly added slot is reset to NULL below.  */
547 if (async
>= ptx_dev
->async_streams
.size
)
549 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
551 if (async
>= newsize
)
554 ptx_dev
->async_streams
.arr
555 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
556 newsize
* sizeof (struct ptx_stream
*));
558 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
559 ptx_dev
->async_streams
.arr
[i
] = NULL
;
561 ptx_dev
->async_streams
.size
= newsize
;
564 /* Create a new stream on-demand if there isn't one already, or if we're
565 setting a particular async value to an existing (externally-provided)
567 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
571 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
574 s
->stream
= existing
;
577 r
= CUDA_CALL_NOCHECK (cuStreamCreate
, &s
->stream
,
579 if (r
!= CUDA_SUCCESS
)
581 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
582 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
587 /* If CREATE is true, we're going to be queueing some work on this
588 stream. Associate it with the current host thread. */
589 s
->host_thread
= thread
;
590 s
->multithreaded
= false;
594 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
595 GOMP_PLUGIN_fatal ("map_init fail");
598 s
->next
= ptx_dev
->active_streams
;
599 ptx_dev
->active_streams
= s
;
600 ptx_dev
->async_streams
.arr
[async
] = s
;
603 stream
= ptx_dev
->async_streams
.arr
[async
];
608 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
609 GOMP_PLUGIN_fatal ("bad async %d", async
);
614 assert (stream
!= NULL
);
/* NOTE(review): the thread-ID test that follows compares pthread_t values
   with the != operator, while the check at the end of this function uses
   pthread_equal.  POSIX only specifies pthread_equal for comparing thread
   IDs; consider using it in both places.  */
616 /* If we're trying to use the same stream from different threads
617 simultaneously, set stream->multithreaded to true. This affects the
618 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
619 only wait for asynchronous launches from the same host thread they are
620 invoked on. If multiple threads use the same async value, we make note
621 of that here and fall back to testing/waiting for all threads in those
623 if (thread
!= stream
->host_thread
)
624 stream
->multithreaded
= true;
626 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
628 else if (stream
&& !stream
->multithreaded
629 && !pthread_equal (stream
->host_thread
, thread
))
630 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
635 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
636 should be locked on entry and remains locked on exit. */
643 if (instantiated_devices
!= 0)
647 pthread_mutex_init (&ptx_event_lock
, NULL
);
649 if (!init_cuda_lib ())
652 CUDA_CALL (cuInit
, 0);
654 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
655 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
660 /* Select the N'th PTX device for the current host thread. The device must
661 have been previously opened before calling this function. */
664 nvptx_attach_host_thread_to_device (int n
)
668 struct ptx_device
*ptx_dev
;
671 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
672 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
674 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
678 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
684 ptx_dev
= ptx_devices
[n
];
687 GOMP_PLUGIN_error ("device %d not found", n
);
691 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
693 /* We don't necessarily have a current context (e.g. if it has been
694 destroyed). Pop it if we do though. */
696 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
698 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
703 static struct ptx_device
*
704 nvptx_open_device (int n
)
706 struct ptx_device
*ptx_dev
;
707 CUdevice dev
, ctx_dev
;
709 int async_engines
, pi
;
711 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
713 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
717 ptx_dev
->ctx_shared
= false;
719 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
720 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
722 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
726 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
728 /* The current host thread has an active context for a different device.
731 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
734 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
737 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
739 ptx_dev
->ctx_shared
= true;
741 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
742 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
743 ptx_dev
->overlap
= pi
;
745 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
746 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
749 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
750 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
751 ptx_dev
->concur
= pi
;
753 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
754 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
757 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
758 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
761 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
762 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
763 ptx_dev
->clock_khz
= pi
;
765 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
766 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
767 ptx_dev
->num_sms
= pi
;
769 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
770 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
771 ptx_dev
->regs_per_block
= pi
;
773 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
774 in CUDA 6.0 and newer. */
775 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
776 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
778 /* Fallback: use limit of registers per block, which is usually equal. */
779 if (r
== CUDA_ERROR_INVALID_VALUE
)
780 pi
= ptx_dev
->regs_per_block
;
781 else if (r
!= CUDA_SUCCESS
)
783 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
786 ptx_dev
->regs_per_sm
= pi
;
788 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
789 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
792 GOMP_PLUGIN_error ("Only warp size 32 is supported");
795 ptx_dev
->warp_size
= pi
;
797 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
798 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
799 ptx_dev
->max_threads_per_block
= pi
;
801 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
802 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
803 ptx_dev
->max_threads_per_multiprocessor
= pi
;
805 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
806 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
807 if (r
!= CUDA_SUCCESS
)
810 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
811 ptx_dev
->default_dims
[i
] = 0;
813 ptx_dev
->images
= NULL
;
814 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
816 if (!init_streams_for_device (ptx_dev
, async_engines
))
823 nvptx_close_device (struct ptx_device
*ptx_dev
)
828 if (!fini_streams_for_device (ptx_dev
))
831 pthread_mutex_destroy (&ptx_dev
->image_lock
);
833 if (!ptx_dev
->ctx_shared
)
834 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
841 nvptx_get_num_devices (void)
845 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
847 if (sizeof (void *) != 8)
849 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
850 " only 64-bit configurations are supported\n");
854 /* This function will be called before the plugin has been initialized in
855 order to enumerate available devices, but CUDA API routines can't be used
856 until cuInit has been called. Just call it now (but don't yet do any
857 further initialization). */
858 if (instantiated_devices
== 0)
860 if (!init_cuda_lib ())
862 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
863 /* This is not an error: e.g. we may have CUDA libraries installed but
864 no devices available. */
865 if (r
!= CUDA_SUCCESS
)
867 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
873 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
878 notify_var (const char *var_name
, const char *env_var
)
881 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
883 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
887 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
889 const char *var_name
= "GOMP_NVPTX_JIT";
890 const char *env_var
= secure_getenv (var_name
);
891 notify_var (var_name
, env_var
);
896 const char *c
= env_var
;
902 if (c
[0] == '-' && c
[1] == 'O'
903 && '0' <= c
[2] && c
[2] <= '4'
904 && (c
[3] == '\0' || c
[3] == ' '))
906 *gomp_nvptx_o
= c
[2] - '0';
911 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
917 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
920 CUjit_option opts
[7];
925 CUlinkState linkstate
;
928 size_t linkoutsize
__attribute__ ((unused
));
930 opts
[0] = CU_JIT_WALL_TIME
;
931 optvals
[0] = &elapsed
;
933 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
934 optvals
[1] = &ilog
[0];
936 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
937 optvals
[2] = (void *) sizeof ilog
;
939 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
940 optvals
[3] = &elog
[0];
942 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
943 optvals
[4] = (void *) sizeof elog
;
945 opts
[5] = CU_JIT_LOG_VERBOSE
;
946 optvals
[5] = (void *) 1;
948 static intptr_t gomp_nvptx_o
= -1;
950 static bool init_done
= false;
953 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
958 if (gomp_nvptx_o
!= -1)
960 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
961 optvals
[nopts
] = (void *) gomp_nvptx_o
;
965 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
966 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
968 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
970 for (; num_objs
--; ptx_objs
++)
972 /* cuLinkAddData's 'data' argument erroneously omits the const
974 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
975 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
976 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
977 (char *) ptx_objs
->code
, ptx_objs
->size
,
980 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
981 (char *) ptx_objs
->code
, ptx_objs
->size
,
983 if (r
!= CUDA_SUCCESS
)
985 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
986 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
992 GOMP_PLUGIN_debug (0, "Linking\n");
993 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
995 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
996 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
998 if (r
!= CUDA_SUCCESS
)
1000 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
1004 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
1005 CUDA_CALL (cuLinkDestroy
, linkstate
);
1010 event_gc (bool memmap_lockable
)
1012 struct ptx_event
*ptx_event
= ptx_events
;
1013 struct ptx_event
*async_cleanups
= NULL
;
1014 struct nvptx_thread
*nvthd
= nvptx_thread ();
1016 pthread_mutex_lock (&ptx_event_lock
);
1018 while (ptx_event
!= NULL
)
1021 struct ptx_event
*e
= ptx_event
;
1023 ptx_event
= ptx_event
->next
;
1025 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
1028 r
= CUDA_CALL_NOCHECK (cuEventQuery
, *e
->evt
);
1029 if (r
== CUDA_SUCCESS
)
1031 bool append_async
= false;
1046 case PTX_EVT_ASYNC_CLEANUP
:
1048 /* The function gomp_plugin_async_unmap_vars needs to claim the
1049 memory-map splay tree lock for the current device, so we
1050 can't call it when one of our callers has already claimed
1051 the lock. In that case, just delay the GC for this event
1053 if (!memmap_lockable
)
1056 append_async
= true;
1061 CUDA_CALL_NOCHECK (cuEventDestroy
, *te
);
1064 /* Unlink 'e' from ptx_events list. */
1065 if (ptx_events
== e
)
1066 ptx_events
= ptx_events
->next
;
1069 struct ptx_event
*e_
= ptx_events
;
1070 while (e_
->next
!= e
)
1072 e_
->next
= e_
->next
->next
;
1077 e
->next
= async_cleanups
;
/* NOTE(review): the traversal cursor ptx_event was initialised from
   ptx_events in its declaration at the top of this function, i.e. before
   ptx_event_lock was acquired, whereas event_add updates the list head
   only while holding that lock.  The unlocked read looks racy; consider
   sampling ptx_events after taking the lock.  */
1085 pthread_mutex_unlock (&ptx_event_lock
);
1087 /* We have to do these here, after ptx_event_lock is released. */
/* Events chained onto async_cleanups above are handed, one at a time, to
   GOMP_PLUGIN_async_unmap_vars with their recorded addr and val.  */
1088 while (async_cleanups
)
1090 struct ptx_event
*e
= async_cleanups
;
1091 async_cleanups
= async_cleanups
->next
;
1093 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
/* Record a new event of kind TYPE: a ptx_event node is allocated, its
   type, addr (H), ord (copied from the current thread's ptx_dev) and val
   (VAL) fields are filled in, and the node is pushed onto the head of the
   global ptx_events list while holding ptx_event_lock.  TYPE must be one
   of the four PTX_EVT_* kinds checked by the assert below.  */
1099 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
1101 struct ptx_event
*ptx_event
;
1102 struct nvptx_thread
*nvthd
= nvptx_thread ();
1104 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
1105 || type
== PTX_EVT_ASYNC_CLEANUP
);
1107 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
1108 ptx_event
->type
= type
;
1110 ptx_event
->addr
= h
;
1111 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
1112 ptx_event
->val
= val
;
/* The shared list head is only modified with ptx_event_lock held.  */
1114 pthread_mutex_lock (&ptx_event_lock
);
1116 ptx_event
->next
= ptx_events
;
1117 ptx_events
= ptx_event
;
1119 pthread_mutex_unlock (&ptx_event_lock
);
1123 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
1124 int async
, unsigned *dims
, void *targ_mem_desc
)
1126 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
1127 CUfunction function
;
1130 struct ptx_stream
*dev_str
;
1134 struct nvptx_thread
*nvthd
= nvptx_thread ();
1135 int warp_size
= nvthd
->ptx_dev
->warp_size
;
1136 const char *maybe_abort_msg
= "(perhaps abort was called)";
1138 function
= targ_fn
->fn
;
1140 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1141 assert (dev_str
== nvthd
->current_stream
);
1143 /* Initialize the launch dimensions. Typically this is constant,
1144 provided by the device compiler, but we must permit runtime
1147 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1149 if (targ_fn
->launch
->dim
[i
])
1150 dims
[i
] = targ_fn
->launch
->dim
[i
];
1157 pthread_mutex_lock (&ptx_dev_lock
);
1159 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
1160 if (!gomp_openacc_dims
[0])
1162 /* See if the user provided GOMP_OPENACC_DIM environment
1163 variable to specify runtime defaults. */
1164 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1165 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
1168 if (!nvthd
->ptx_dev
->default_dims
[0])
1170 int default_dims
[GOMP_DIM_MAX
];
1171 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
1172 default_dims
[i
] = gomp_openacc_dims
[i
];
1174 int gang
, worker
, vector
;
1176 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
1177 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
1178 int dev_size
= nvthd
->ptx_dev
->num_sms
;
1179 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1180 " dev_size=%d, cpu_size=%d\n",
1181 warp_size
, block_size
, dev_size
, cpu_size
);
1183 gang
= (cpu_size
/ block_size
) * dev_size
;
1184 worker
= block_size
/ warp_size
;
1188 /* There is no upper bound on the gang size. The best size
1189 matches the hardware configuration. Logical gangs are
1190 scheduled onto physical hardware. To maximize usage, we
1191 should guess a large number. */
1192 if (default_dims
[GOMP_DIM_GANG
] < 1)
1193 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
1194 /* The worker size must not exceed the hardware. */
1195 if (default_dims
[GOMP_DIM_WORKER
] < 1
1196 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
1197 default_dims
[GOMP_DIM_WORKER
] = worker
;
1198 /* The vector size must exactly match the hardware. */
1199 if (default_dims
[GOMP_DIM_VECTOR
] < 1
1200 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
1201 default_dims
[GOMP_DIM_VECTOR
] = vector
;
1203 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1204 default_dims
[GOMP_DIM_GANG
],
1205 default_dims
[GOMP_DIM_WORKER
],
1206 default_dims
[GOMP_DIM_VECTOR
]);
1208 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1209 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
1211 pthread_mutex_unlock (&ptx_dev_lock
);
1214 bool default_dim_p
[GOMP_DIM_MAX
];
1215 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1216 default_dim_p
[i
] = !dims
[i
];
1218 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
1220 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1221 if (default_dim_p
[i
])
1222 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
1224 if (default_dim_p
[GOMP_DIM_VECTOR
])
1225 dims
[GOMP_DIM_VECTOR
]
1226 = MIN (dims
[GOMP_DIM_VECTOR
],
1227 (targ_fn
->max_threads_per_block
/ warp_size
1230 if (default_dim_p
[GOMP_DIM_WORKER
])
1231 dims
[GOMP_DIM_WORKER
]
1232 = MIN (dims
[GOMP_DIM_WORKER
],
1233 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
1237 /* Handle the case that the compiler allows the runtime to choose
1238 the vector-length conservatively, by ignoring
1239 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
1242 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
1243 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
1244 exceed targ_fn->max_threads_per_block. */
1245 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
1246 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
1249 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
1250 &blocks
, function
, NULL
, 0,
1251 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
1252 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
1253 "grid = %d, block = %d\n", grids
, blocks
);
1255 /* Keep the num_gangs proportional to the block size. In
1256 the case where a block size is limited by shared-memory
1257 or the register file capacity, the runtime will not
1258 excessively over assign gangs to the multiprocessor
1259 units if their state is going to be swapped out even
1260 more than necessary. The constant factor 2 is there to
1261 prevent threads from idling when there is insufficient
1264 gangs
= 2 * grids
* (blocks
/ warp_size
);
1267 vectors
= warp_size
;
1271 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
1273 : dims
[GOMP_DIM_VECTOR
]);
1274 workers
= blocks
/ actual_vectors
;
1277 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
1278 if (default_dim_p
[i
])
1281 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
1282 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
1283 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
1284 default: GOMP_PLUGIN_fatal ("invalid dim");
1290 /* Check if the accelerator has sufficient hardware resources to
1291 launch the offloaded kernel. */
1292 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
1293 > targ_fn
->max_threads_per_block
)
1296 = targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
];
1297 GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
1298 " launch '%s' with num_workers = %d; recompile the"
1299 " program with 'num_workers = %d' on that offloaded"
1300 " region or '-fopenacc-dim=:%d'",
1301 targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1302 suggest_workers
, suggest_workers
);
1305 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1306 the host and the device. HP is a host pointer to the new chunk, and DP is
1307 the corresponding device pointer. */
1308 pthread_mutex_lock (&ptx_event_lock
);
1309 dp
= map_push (dev_str
, mapnum
* sizeof (void *));
1310 pthread_mutex_unlock (&ptx_event_lock
);
1312 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1314 /* Copy the array of arguments to the mapped page. */
1315 hp
= alloca(sizeof(void *) * mapnum
);
1316 for (i
= 0; i
< mapnum
; i
++)
1317 ((void **) hp
)[i
] = devaddrs
[i
];
1319 /* Copy the (device) pointers to arguments to the device */
1320 CUDA_CALL_ASSERT (cuMemcpyHtoD
, dp
, hp
,
1321 mapnum
* sizeof (void *));
1322 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1323 " gangs=%u, workers=%u, vectors=%u\n",
1324 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1325 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1329 // num_gangs nctaid.x
1330 // num_workers ntid.y
1331 // vector length ntid.x
1334 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1335 dims
[GOMP_DIM_GANG
], 1, 1,
1336 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1337 0, dev_str
->stream
, kargs
, 0);
1339 #ifndef DISABLE_ASYNC
1340 if (async
< acc_async_noval
)
1342 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, dev_str
->stream
);
1343 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1344 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1346 else if (r
!= CUDA_SUCCESS
)
1347 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1353 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1355 r
= CUDA_CALL_NOCHECK (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1356 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1357 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
1359 else if (r
!= CUDA_SUCCESS
)
1360 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
1364 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
1366 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1369 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1370 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1371 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1373 else if (r
!= CUDA_SUCCESS
)
1374 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1377 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1378 targ_fn
->launch
->fn
);
1380 #ifndef DISABLE_ASYNC
1381 if (async
< acc_async_noval
)
/* Declaration only: openacc_get_current_cuda_context takes no arguments and
   yields a void pointer.  */
1386 void * openacc_get_current_cuda_context (void);
/* Request S bytes of device memory through cuMemAlloc, which stores its
   result in D.  The call goes through CUDA_CALL_ERET with NULL as the first
   argument.
   NOTE(review): the declaration of D and the return statement are not part
   of this listing -- presumably D is handed back as a pointer; confirm
   against the complete file.  */
1389 nvptx_alloc (size_t s
)
1393 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
/* Release the device allocation P.  cuMemGetAddressRange is queried for P
   first (results in PB and PS); the guard (CUdeviceptr) P != PB leads to the
   "invalid device address" report through GOMP_PLUGIN_error.  cuMemFree is
   then issued on P.
   NOTE(review): the declarations of PB/PS, what follows the error report and
   the return values are missing from this listing.  */
1398 nvptx_free (void *p
)
1403 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1404 if ((CUdeviceptr
) p
!= pb
)
1406 GOMP_PLUGIN_error ("invalid device address");
1410 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
/* Copy S bytes from host address H to device address D.

   Diagnostics visible here, all through GOMP_PLUGIN_error: "invalid device
   address" (twice), "invalid host address", "invalid host or device
   address" and "invalid size".  Only the guard of the last one survives in
   this listing: D + S must not exceed PB + PS, the values filled in by
   cuMemGetAddressRange for D.

   Below the DISABLE_ASYNC conditional, a thread whose current_stream differs
   from the device's null_stream creates an event (CU_EVENT_DISABLE_TIMING),
   queues cuMemcpyHtoDAsync on that stream, records the event on the same
   stream and passes it to event_add as PTX_EVT_MEM together with H.  The
   other visible copy path is the plain cuMemcpyHtoD call.
   NOTE(review): the remaining guards, the else/endif structure and the
   return values are not part of this listing -- confirm against the
   complete file.  */
1416 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1420 struct nvptx_thread
*nvthd
= nvptx_thread ();
1426 GOMP_PLUGIN_error ("invalid device address");
1430 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1434 GOMP_PLUGIN_error ("invalid device address");
1439 GOMP_PLUGIN_error ("invalid host address");
1444 GOMP_PLUGIN_error ("invalid host or device address");
1447 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1449 GOMP_PLUGIN_error ("invalid size");
1453 #ifndef DISABLE_ASYNC
1454 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1456 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1457 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1459 CUDA_CALL (cuMemcpyHtoDAsync
,
1460 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1461 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1462 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1466 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
/* Copy S bytes from device address D to host address H.

   The same five GOMP_PLUGIN_error diagnostics as in the host-to-device
   direction appear here; the one guard that survives in this listing rejects
   D + S beyond PB + PS (the values cuMemGetAddressRange fills in for D) with
   "invalid size".

   Below the DISABLE_ASYNC conditional, a thread whose current_stream differs
   from the device's null_stream creates an event (CU_EVENT_DISABLE_TIMING),
   queues cuMemcpyDtoHAsync on that stream, records the event there and
   registers it with event_add as PTX_EVT_MEM, keyed by H.  The other visible
   copy path is the plain cuMemcpyDtoH call.
   NOTE(review): the remaining guards, the else/endif structure and the
   return values are not part of this listing -- confirm against the
   complete file.  */
1472 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1476 struct nvptx_thread
*nvthd
= nvptx_thread ();
1482 GOMP_PLUGIN_error ("invalid device address");
1486 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1490 GOMP_PLUGIN_error ("invalid device address");
1495 GOMP_PLUGIN_error ("invalid host address");
1500 GOMP_PLUGIN_error ("invalid host or device address");
1503 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1505 GOMP_PLUGIN_error ("invalid size");
1509 #ifndef DISABLE_ASYNC
1510 if (nvthd
&& nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1512 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1513 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1515 CUDA_CALL (cuMemcpyDtoHAsync
,
1516 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1517 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1518 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1522 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
/* Make the stream that select_stream_for_async returns for ASYNC and the
   calling thread (third argument true, no explicit stream) the current_stream
   of the calling thread's nvptx_thread record.  */
1528 nvptx_set_async (int async
)
1530 struct nvptx_thread
*nvthd
= nvptx_thread ();
1531 nvthd
->current_stream
1532 = select_stream_for_async (async
, pthread_self (), true, NULL
);
/* Poll the stream associated with ASYNC.  The stream is looked up through
   select_stream_for_async with its third argument false; "unknown async %d"
   is a fatal diagnostic (presumably for a failed lookup -- its guard is not
   in this listing).  cuStreamQuery is then classified three ways:
   CUDA_SUCCESS, CUDA_ERROR_NOT_READY, and anything else, which is fatal.
   NOTE(review): the statements executed for the first two outcomes,
   including the event_gc call the inner comment refers to and the values
   returned, are missing from this listing.  */
1536 nvptx_async_test (int async
)
1539 struct ptx_stream
*s
;
1541 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1544 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1546 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1547 if (r
== CUDA_SUCCESS
)
1549 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1550 whether all work has completed on this stream, and if so omits the call
1551 to the wait hook. If that happens, event_gc might not get called
1552 (which prevents variables from getting unmapped and their associated
1553 device storage freed), so call it here. */
1557 else if (r
== CUDA_ERROR_NOT_READY
)
1560 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
/* Poll every relevant stream of the current device.  With the device's
   stream_lock held, active_streams is walked; an entry counts when it is
   flagged multithreaded or its host_thread equals the caller.  The first
   such entry for which cuStreamQuery reports CUDA_ERROR_NOT_READY causes the
   lock to be dropped inside the loop; otherwise the lock is dropped after
   the loop.
   NOTE(review): the values returned on either path, and anything done after
   the final unlock, are not part of this listing.  */
1566 nvptx_async_test_all (void)
1568 struct ptx_stream
*s
;
1569 pthread_t self
= pthread_self ();
1570 struct nvptx_thread
*nvthd
= nvptx_thread ();
1572 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1574 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1576 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1577 && CUDA_CALL_NOCHECK (cuStreamQuery
,
1578 s
->stream
) == CUDA_ERROR_NOT_READY
)
1580 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1585 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Synchronize with the stream associated with ASYNC: the stream is looked up
   through select_stream_for_async (third argument false) and
   cuStreamSynchronize is issued on it via CUDA_CALL_ASSERT.  "unknown async
   %d" is a fatal diagnostic whose guard is not in this listing (presumably a
   failed lookup).  */
1593 nvptx_wait (int async
)
1595 struct ptx_stream
*s
;
1597 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1599 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1601 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
/* Make the stream of ASYNC2 wait for the stream of ASYNC1.  S2 (the waiter)
   is looked up with the third argument of select_stream_for_async set to
   true, S1 with false.  A newly allocated event, created with
   CU_EVENT_DISABLE_TIMING, is recorded on S1's stream, registered through
   event_add as PTX_EVT_SYNC, and then passed to cuStreamWaitEvent on S2's
   stream.
   NOTE(review): the guards of the two fatal diagnostics ("invalid async 1\n"
   and "identical parameters") and the declaration of E are missing from this
   listing.  */
1607 nvptx_wait_async (int async1
, int async2
)
1610 struct ptx_stream
*s1
, *s2
;
1611 pthread_t self
= pthread_self ();
1613 /* The stream that is waiting (rather than being waited for) doesn't
1614 necessarily have to exist already. */
1615 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1617 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1619 GOMP_PLUGIN_fatal ("invalid async 1\n");
1622 GOMP_PLUGIN_fatal ("identical parameters");
1624 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1626 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1630 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1632 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1634 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
/* Wait on the streams of the current device.  With the device's stream_lock
   held, each active_streams entry that is flagged multithreaded or whose
   host_thread equals the caller is polled with cuStreamQuery; a status other
   than CUDA_SUCCESS or CUDA_ERROR_NOT_READY is fatal, and
   cuStreamSynchronize is the blocking call issued on the stream.
   NOTE(review): the statement executed on CUDA_SUCCESS, the declaration of R
   and anything after the unlock are missing from this listing; the comment
   inside the body is also cut short here.  */
1638 nvptx_wait_all (void)
1641 struct ptx_stream
*s
;
1642 pthread_t self
= pthread_self ();
1643 struct nvptx_thread
*nvthd
= nvptx_thread ();
1645 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1647 /* Wait for active streams initiated by this thread (or by multiple threads)
1649 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1651 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1653 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, s
->stream
);
1654 if (r
== CUDA_SUCCESS
)
1656 else if (r
!= CUDA_ERROR_NOT_READY
)
1657 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1659 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1663 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Make the stream of ASYNC wait for the other active streams of the current
   device.  The waiting stream is obtained from select_stream_for_async with
   its third argument true.  There is a guard for a missing waiting stream or
   one equal to the device's null_stream (its body is not in this listing).
   With stream_lock held, active_streams is walked; for the entries that are
   processed, a new event (CU_EVENT_DISABLE_TIMING) is recorded on the other
   stream, registered through event_add as PTX_EVT_SYNC, and handed to
   cuStreamWaitEvent on the waiting stream.
   NOTE(review): the body of the in-loop guard (entries that are neither
   multithreaded nor owned by the caller -- presumably skipped), the
   declaration of E and the left-hand side of the first assignment are
   missing from this listing.  */
1669 nvptx_wait_all_async (int async
)
1671 struct ptx_stream
*waiting_stream
, *other_stream
;
1673 struct nvptx_thread
*nvthd
= nvptx_thread ();
1674 pthread_t self
= pthread_self ();
1676 /* The stream doing the waiting. This could be the first mention of the
1677 stream, so create it if necessary. */
1679 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1681 /* Launches on the null stream already block on other streams in the
1683 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1688 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1690 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1691 other_stream
!= NULL
;
1692 other_stream
= other_stream
->next
)
1694 if (!other_stream
->multithreaded
1695 && !pthread_equal (other_stream
->host_thread
, self
))
1698 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1700 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1702 /* Record an event on the waited-for stream. */
1703 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1705 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1707 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1710 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
/* Return the address of the DEV member of the calling thread's ptx_dev.
   A guard covers the case of no thread record or no ptx_dev.
   NOTE(review): the result produced under that guard is not part of this
   listing.  */
1714 nvptx_get_current_cuda_device (void)
1716 struct nvptx_thread
*nvthd
= nvptx_thread ();
1718 if (!nvthd
|| !nvthd
->ptx_dev
)
1721 return &nvthd
->ptx_dev
->dev
;
/* Return the CTX member of the calling thread's ptx_dev.  A guard covers the
   case of no thread record or no ptx_dev.
   NOTE(review): the result produced under that guard is not part of this
   listing.  */
1725 nvptx_get_current_cuda_context (void)
1727 struct nvptx_thread
*nvthd
= nvptx_thread ();
1729 if (!nvthd
|| !nvthd
->ptx_dev
)
1732 return nvthd
->ptx_dev
->ctx
;
/* Return the STREAM member of the ptx_stream that select_stream_for_async
   yields for ASYNC (third argument false), or NULL when that lookup yields
   no stream.  A guard covers the case of no thread record or no ptx_dev.
   NOTE(review): the result produced under that guard is not part of this
   listing.  */
1736 nvptx_get_cuda_stream (int async
)
1738 struct ptx_stream
*s
;
1739 struct nvptx_thread
*nvthd
= nvptx_thread ();
1741 if (!nvthd
|| !nvthd
->ptx_dev
)
1744 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1746 return s
? s
->stream
: NULL
;
/* Associate the caller-supplied STREAM with ASYNC.  With the device's
   stream_lock held, the stream currently registered for ASYNC (OLDSTREAM) is
   looked up and unlinked from active_streams -- either by advancing the list
   head or by walking to its predecessor -- after which cuStreamDestroy and
   map_fini are applied to it; a map_fini failure is fatal.  Once the lock is
   released, select_stream_for_async is called again with its third argument
   true and STREAM cast to CUstream as the fourth.
   NOTE(review): the guard of the "bad async %d" diagnostic, the test for
   whether OLDSTREAM exists, the body of the while loop and the return value
   are missing from this listing.  */
1750 nvptx_set_cuda_stream (int async
, void *stream
)
1752 struct ptx_stream
*oldstream
;
1753 pthread_t self
= pthread_self ();
1754 struct nvptx_thread
*nvthd
= nvptx_thread ();
1757 GOMP_PLUGIN_fatal ("bad async %d", async
);
1759 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1761 /* We have a list of active streams and an array mapping async values to
1762 entries of that list. We need to take "ownership" of the passed-in stream,
1763 and add it to our list, removing the previous entry also (if there was one)
1764 in order to prevent resource leaks. Note the potential for surprise
1765 here: maybe we should keep track of passed-in streams and leave it up to
1766 the user to tidy those up, but that doesn't work for stream handles
1767 returned from acc_get_cuda_stream above... */
1769 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1773 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1774 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1777 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1778 while (s
->next
!= oldstream
)
1780 s
->next
= s
->next
->next
;
1783 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1785 if (!map_fini (oldstream
))
1786 GOMP_PLUGIN_fatal ("error when freeing host memory");
1791 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1793 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1798 /* Plugin entry points. */
1801 GOMP_OFFLOAD_get_name (void)
/* Report the plugin's capability mask: the bitwise OR of
   GOMP_OFFLOAD_CAP_OPENACC_200 and GOMP_OFFLOAD_CAP_OPENMP_400.  */
1807 GOMP_OFFLOAD_get_caps (void)
1809 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
/* Report the offload target type, always OFFLOAD_TARGET_TYPE_NVIDIA_PTX.  */
1813 GOMP_OFFLOAD_get_type (void)
1815 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Report the device count by forwarding to nvptx_get_num_devices.  */
1819 GOMP_OFFLOAD_get_num_devices (void)
1821 return nvptx_get_num_devices ();
/* Initialize device N under ptx_dev_lock.  If nvptx_init fails or
   ptx_devices[N] is already populated, the lock is released on that path.
   Otherwise the result of nvptx_open_device (N) is stored in ptx_devices[N]
   and instantiated_devices is incremented before the lock is released.
   NOTE(review): the values returned on each path, and any guard around the
   store of DEV, are not part of this listing.  */
1825 GOMP_OFFLOAD_init_device (int n
)
1827 struct ptx_device
*dev
;
1829 pthread_mutex_lock (&ptx_dev_lock
);
1831 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1833 pthread_mutex_unlock (&ptx_dev_lock
);
1837 dev
= nvptx_open_device (n
);
1840 ptx_devices
[n
] = dev
;
1841 instantiated_devices
++;
1844 pthread_mutex_unlock (&ptx_dev_lock
);
/* Shut down device N under ptx_dev_lock.  Nothing is torn down unless
   ptx_devices[N] is populated.  In that case the host thread is attached to
   the device and nvptx_close_device is applied to the entry; if either step
   fails the lock is released on that path.  On success the slot is reset to
   NULL and instantiated_devices is decremented before the final unlock.
   NOTE(review): the values returned on each path are not part of this
   listing.  */
1850 GOMP_OFFLOAD_fini_device (int n
)
1852 pthread_mutex_lock (&ptx_dev_lock
);
1854 if (ptx_devices
[n
] != NULL
)
1856 if (!nvptx_attach_host_thread_to_device (n
)
1857 || !nvptx_close_device (ptx_devices
[n
]))
1859 pthread_mutex_unlock (&ptx_dev_lock
);
1862 ptx_devices
[n
] = NULL
;
1863 instantiated_devices
--;
1866 pthread_mutex_unlock (&ptx_dev_lock
);
1870 /* Return the libgomp version number we're compatible with. There is
1871 no requirement for cross-version compatibility. */
/* The reported version is the GOMP_VERSION constant, unmodified.  */
1874 GOMP_OFFLOAD_version (void)
1876 return GOMP_VERSION
;
1879 /* Initialize __nvptx_clocktick, if present in MODULE. */
/* Look up the global "__nvptx_clocktick" in MODULE with cuModuleGetGlobal
   (size output not requested).  CUDA_ERROR_NOT_FOUND is tested separately
   from other failures; any other non-success status is fatal.  The value
   written to the device with cuMemcpyHtoD is the double
   1e-3 / DEV->clock_khz, and a failure of that copy is fatal as well.
   NOTE(review): the body of the CUDA_ERROR_NOT_FOUND branch (presumably an
   early return) and the declaration of DPTR are not part of this
   listing.  */
1882 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1885 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1886 module
, "__nvptx_clocktick");
1887 if (r
== CUDA_ERROR_NOT_FOUND
)
1889 if (r
!= CUDA_SUCCESS
)
1890 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1891 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1892 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1893 sizeof (__nvptx_clocktick
));
1894 if (r
!= CUDA_SUCCESS
)
1895 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1898 /* Load the (partial) program described by TARGET_DATA to device
1899 number ORD. Allocate and return TARGET_TABLE. */
/* Steps visible in this listing, in order:

   1. VERSION is rejected with a GOMP_PLUGIN_error diagnostic when
      GOMP_VERSION_DEV (VERSION) exceeds GOMP_VERSION_NVIDIA_PTX.
   2. The host thread is attached to device ORD and the image's ptx_objs are
      linked into MODULE by link_ptx; a failure of either is guarded.
   3. Function and variable counts and name/descriptor arrays are read from
      the image header (TARGET_DATA viewed as nvptx_tdata_t).
   4. An addr_pair table of fn_entries + var_entries elements and an array of
      function descriptors are allocated; *TARGET_TABLE is set to the table.
   5. A ptx_image_data record holding TARGET_DATA, MODULE and the descriptor
      array is pushed on DEV->images under DEV->image_lock.
   6. For each function: the CUfunction and its CU_FUNC_ATTRIBUTE_NUM_REGS and
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK attributes are fetched and
      stored in the descriptor; the table entry spans the descriptor's
      address to that address plus one.
   7. For each variable: cuModuleGetGlobal supplies address and size; the
      table entry spans [address, address + size).
   8. nvptx_set_clocktick is applied, and fn_entries + var_entries is
      returned.

   The CUDA calls in steps 6 and 7 go through CUDA_CALL_ERET with -1.
   NOTE(review): the declarations of MODULE, NREGS, MTHRS, VAR and BYTES, the
   results of the failure guards in steps 1 and 2, and parts of some argument
   lists are missing from this listing.  */
1902 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1903 struct addr_pair
**target_table
)
1906 const char *const *var_names
;
1907 const struct targ_fn_launch
*fn_descs
;
1908 unsigned int fn_entries
, var_entries
, i
, j
;
1909 struct targ_fn_descriptor
*targ_fns
;
1910 struct addr_pair
*targ_tbl
;
1911 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1912 struct ptx_image_data
*new_image
;
1913 struct ptx_device
*dev
;
1915 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1917 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1918 " (expected %u, received %u)",
1919 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1923 if (!nvptx_attach_host_thread_to_device (ord
)
1924 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1927 dev
= ptx_devices
[ord
];
1929 /* The mkoffload utility emits a struct of pointers/integers at the
1930 start of each offload image. The array of kernel names and the
1931 functions addresses form a one-to-one correspondence. */
1933 var_entries
= img_header
->var_num
;
1934 var_names
= img_header
->var_names
;
1935 fn_entries
= img_header
->fn_num
;
1936 fn_descs
= img_header
->fn_descs
;
1938 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1939 * (fn_entries
+ var_entries
));
1940 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1943 *target_table
= targ_tbl
;
1945 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1946 new_image
->target_data
= target_data
;
1947 new_image
->module
= module
;
1948 new_image
->fns
= targ_fns
;
1950 pthread_mutex_lock (&dev
->image_lock
);
1951 new_image
->next
= dev
->images
;
1952 dev
->images
= new_image
;
1953 pthread_mutex_unlock (&dev
->image_lock
);
1955 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1957 CUfunction function
;
1960 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1962 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1963 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1964 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1965 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1967 targ_fns
->fn
= function
;
1968 targ_fns
->launch
= &fn_descs
[i
];
1969 targ_fns
->regs_per_thread
= nregs
;
1970 targ_fns
->max_threads_per_block
= mthrs
;
1972 targ_tbl
->start
= (uintptr_t) targ_fns
;
1973 targ_tbl
->end
= targ_tbl
->start
+ 1;
1976 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1981 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1982 &var
, &bytes
, module
, var_names
[j
]);
1984 targ_tbl
->start
= (uintptr_t) var
;
1985 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1988 nvptx_set_clocktick (module
, dev
);
1990 return fn_entries
+ var_entries
;
1993 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1994 function descriptors allocated by G_O_load_image. */
/* VERSION is rejected with a GOMP_PLUGIN_error diagnostic when
   GOMP_VERSION_DEV (VERSION) exceeds GOMP_VERSION_NVIDIA_PTX.  Otherwise,
   with DEV->image_lock held, DEV->images is searched through a
   pointer-to-link (PREV_P) for the record whose target_data equals
   TARGET_DATA; that record is unlinked and cuModuleUnload is applied to its
   module, with a non-success result tested explicitly.
   NOTE(review): what happens on a cuModuleUnload failure, the release of the
   record's storage, the loop exit and the return values are missing from
   this listing.  */
1997 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1999 struct ptx_image_data
*image
, **prev_p
;
2000 struct ptx_device
*dev
= ptx_devices
[ord
];
2002 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
2004 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
2005 " (expected %u, received %u)",
2006 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
2011 pthread_mutex_lock (&dev
->image_lock
);
2012 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
2013 if (image
->target_data
== target_data
)
2015 *prev_p
= image
->next
;
2016 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
2022 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes on device ORD: the host thread is attached to the
   device first, then the request is forwarded to nvptx_alloc.
   NOTE(review): the result when attaching fails is not part of this
   listing.  */
2027 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
2029 if (!nvptx_attach_host_thread_to_device (ord
))
2031 return nvptx_alloc (size
);
/* Free PTR on device ORD.  The result is true only when attaching the host
   thread to the device and nvptx_free both succeed; nvptx_free is not
   attempted if attaching fails (short-circuit &&).  */
2035 GOMP_OFFLOAD_free (int ord
, void *ptr
)
2037 return (nvptx_attach_host_thread_to_device (ord
)
2038 && nvptx_free (ptr
));
/* Copy N bytes from device address SRC on device ORD to host address DST.
   The result is true only when attaching the host thread to the device and
   nvptx_dev2host both succeed; the copy is not attempted if attaching fails
   (short-circuit &&).  */
2042 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
2044 return (nvptx_attach_host_thread_to_device (ord
)
2045 && nvptx_dev2host (dst
, src
, n
));
/* Copy N bytes from host address SRC to device address DST on device ORD.
   The result is true only when attaching the host thread to the device and
   nvptx_host2dev both succeed; the copy is not attempted if attaching fails
   (short-circuit &&).  */
2049 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
2051 return (nvptx_attach_host_thread_to_device (ord
)
2052 && nvptx_host2dev (dst
, src
, n
));
/* Copy N bytes from device address SRC to device address DST by queueing
   cuMemcpyDtoDAsync on the null_stream of ptx_devices[ORD].  Unlike the
   host/device copy entry points, no call to
   nvptx_attach_host_thread_to_device is visible here.
   NOTE(review): the return statement is not part of this listing.  */
2056 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
2058 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2059 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
,
2060 ptx_dev
->null_stream
->stream
);
/* File-scope pointer to a function taking (int, void *, void *) and returning
   nothing; initialised to NULL.
   NOTE(review): no assignment to or call through this pointer is visible in
   the surrounding code -- check whether it is still needed.  */
2064 void (*device_run
) (int n
, void *fn_ptr
, void *vars
) = NULL
;
/* OpenACC launch entry point: forwards all seven arguments, unchanged and in
   the same order, to nvptx_exec.  */
2067 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
2068 void **hostaddrs
, void **devaddrs
,
2069 int async
, unsigned *dims
, void *targ_mem_desc
)
2071 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, async
, dims
, targ_mem_desc
);
/* Allocate an event, create it with CU_EVENT_DISABLE_TIMING, record it on the
   calling thread's current_stream, and register it through event_add as
   PTX_EVT_ASYNC_CLEANUP together with TARG_MEM_DESC and ASYNC.  */
2075 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
2077 struct nvptx_thread
*nvthd
= nvptx_thread ();
2078 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
2080 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
2081 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
2082 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Plugin entry point; returns whatever nvptx_async_test (ASYNC) returns.  */
2086 GOMP_OFFLOAD_openacc_async_test (int async
)
2088 return nvptx_async_test (async
);
/* Plugin entry point; returns whatever nvptx_async_test_all returns.  */
2092 GOMP_OFFLOAD_openacc_async_test_all (void)
2094 return nvptx_async_test_all ();
2098 GOMP_OFFLOAD_openacc_async_wait (int async
)
/* Plugin entry point; passes ASYNC1 and ASYNC2, in that order, to
   nvptx_wait_async.  */
2104 GOMP_OFFLOAD_openacc_async_wait_async (int async1
, int async2
)
2106 nvptx_wait_async (async1
, async2
);
2110 GOMP_OFFLOAD_openacc_async_wait_all (void)
/* Plugin entry point; passes ASYNC to nvptx_wait_all_async.  */
2116 GOMP_OFFLOAD_openacc_async_wait_all_async (int async
)
2118 nvptx_wait_all_async (async
);
/* Plugin entry point; passes ASYNC to nvptx_set_async.  */
2122 GOMP_OFFLOAD_openacc_async_set_async (int async
)
2124 nvptx_set_async (async
);
/* Allocate a struct nvptx_thread for the calling thread and bind it to
   ptx_devices[ORD]: its current_stream starts out as the device's
   null_stream and its ptx_dev is the device.  The record is returned as a
   void pointer.  The thread's current CUDA context is fetched with
   cuCtxGetCurrent, the device context is asserted to be non-null, and
   cuCtxPushCurrent is issued with the device context.
   NOTE(review): the declaration of THD_CTX and the condition (if any) under
   which the context is pushed are missing from this listing.  */
2128 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
2130 struct ptx_device
*ptx_dev
;
2131 struct nvptx_thread
*nvthd
2132 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
2135 ptx_dev
= ptx_devices
[ord
];
2139 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
2141 assert (ptx_dev
->ctx
);
2144 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
2146 nvthd
->current_stream
= ptx_dev
->null_stream
;
2147 nvthd
->ptx_dev
= ptx_dev
;
2149 return (void *) nvthd
;
2153 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)
/* Plugin entry point; returns whatever nvptx_get_current_cuda_device
   returns.  */
2159 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2161 return nvptx_get_current_cuda_device ();
/* Plugin entry point; returns whatever nvptx_get_current_cuda_context
   returns.  */
2165 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2167 return nvptx_get_current_cuda_context ();
2170 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
/* Plugin entry point; returns whatever nvptx_get_cuda_stream (ASYNC)
   returns.  */
2173 GOMP_OFFLOAD_openacc_cuda_get_stream (int async
)
2175 return nvptx_get_cuda_stream (async
);
2178 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
/* Plugin entry point; returns whatever nvptx_set_cuda_stream (ASYNC, STREAM)
   returns.  */
2181 GOMP_OFFLOAD_openacc_cuda_set_stream (int async
, void *stream
)
2183 return nvptx_set_cuda_stream (async
, stream
);
2186 /* Adjust launch dimensions: pick good values for number of blocks and warps
2187 and ensure that number of warps does not exceed CUDA limits as well as GCC's own limits. */
/* *THREADS_P counts warps per block and *TEAMS_P counts blocks; both are
   updated in place.

   The warp limit is FN->max_threads_per_block / 32, capped at 32.
   *THREADS_P is clamped down to that limit; a non-positive *THREADS_P is
   handled by a separate guard.  The block estimate is
   regs_per_sm / (regs_per_thread * 32 * *THREADS_P) * num_sms, evaluated in
   integer arithmetic; *TEAMS_P is replaced by that estimate when it is
   non-positive or larger.
   NOTE(review): the value assigned when *THREADS_P is non-positive is not
   part of this listing.  Also note that the division has no visible guard
   against FN->regs_per_thread being zero.  */
2191 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2192 struct ptx_device
*ptx_dev
,
2193 int *teams_p
, int *threads_p
)
2195 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2196 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2197 and libgcc, which matches documented limit of all GPUs as of 2015. */
2198 if (max_warps_block
> 32)
2199 max_warps_block
= 32;
2200 if (*threads_p
<= 0)
2202 if (*threads_p
> max_warps_block
)
2203 *threads_p
= max_warps_block
;
2205 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2206 /* This is an estimate of how many blocks the device can host simultaneously.
2207 Actual limit, which may be lower, can be queried with "occupancy control"
2208 driver interface (since CUDA 6.0). */
2209 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2210 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2211 *teams_p
= max_blocks
;
2214 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP target regions. */
2218 nvptx_stacks_size ()
2223 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
/* A single cuMemAlloc request of SIZE * NUM bytes backs all stacks; any
   status other than CUDA_SUCCESS is fatal.  The device address is returned
   cast to a void pointer.
   NOTE(review): the declaration of STACKS is not part of this listing, and
   no overflow check on SIZE * NUM is visible.  */
2226 nvptx_stacks_alloc (size_t size
, int num
)
2229 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
2230 if (r
!= CUDA_SUCCESS
)
2231 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2232 return (void *) stacks
;
2235 /* Release storage previously allocated by nvptx_stacks_alloc. */
/* P is passed to cuMemFree as a CUdeviceptr; any status other than
   CUDA_SUCCESS is fatal.  NUM is not referenced by the statements visible
   here.  */
2238 nvptx_stacks_free (void *p
, int num
)
2240 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
2241 if (r
!= CUDA_SUCCESS
)
2242 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
/* Launch the function described by TGT_FN on device ORD and wait for it.

   ARGS is decoded as a sequence of ids.  An id carrying
   GOMP_TARGET_ARG_SUBSEQUENT_PARAM takes its value from the next element;
   another visible assignment derives the value from the id itself, shifted
   right by GOMP_TARGET_ARG_VALUE_SHIFT.  Values above INT_MAX are capped, and
   the ids recognised are GOMP_TARGET_ARG_NUM_TEAMS and
   GOMP_TARGET_ARG_THREAD_LIMIT.  "No target arguments provided" is a fatal
   diagnostic.

   After nvptx_adjust_launch_bounds has fixed TEAMS and THREADS, storage for
   TEAMS * THREADS stacks of nvptx_stacks_size () bytes each is allocated.
   The kernel arguments {TGT_VARS, STACKS, STACK_SIZE} are supplied as a
   buffer (CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE), and
   cuLaunchKernel is issued on the device's null_stream with grid
   (TEAMS, 1, 1) and block (32, THREADS, 1).  A launch failure is fatal.
   cuCtxSynchronize then waits for completion; CUDA_ERROR_LAUNCH_FAILED gets
   a two-part message, any other failure a one-part message, both fatal.
   The stacks are released last.

   NOTE(review): the loop around the argument decoding, the assignments made
   for the two recognised ids (presumably to TEAMS and THREADS), the
   declaration of the launch-parameter array and of R, and the trailing
   arguments of some calls are missing from this listing.  */
2246 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2248 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
2250 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2251 const char *maybe_abort_msg
= "(perhaps abort was called)";
2252 int teams
= 0, threads
= 0;
2255 GOMP_PLUGIN_fatal ("No target arguments provided");
2258 intptr_t id
= (intptr_t) *args
++, val
;
2259 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2260 val
= (intptr_t) *args
++;
2262 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2263 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2265 val
= val
> INT_MAX
? INT_MAX
: val
;
2266 id
&= GOMP_TARGET_ARG_ID_MASK
;
2267 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2269 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2272 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2274 size_t stack_size
= nvptx_stacks_size ();
2275 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
2276 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2277 size_t fn_args_size
= sizeof fn_args
;
2279 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2280 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2283 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2284 32, threads
, 1, 0, ptx_dev
->null_stream
->stream
,
2286 if (r
!= CUDA_SUCCESS
)
2287 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2289 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2290 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2291 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2293 else if (r
!= CUDA_SUCCESS
)
2294 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2295 nvptx_stacks_free (stacks
, teams
* threads
);
2299 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
2302 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");