1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
70 #if CUDA_VERSION < 6000
71 extern CUresult
cuGetErrorString (CUresult
, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
75 #if CUDA_VERSION >= 6050
78 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
79 const char *, unsigned, CUjit_option
*, void **);
80 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
82 typedef size_t (*CUoccupancyB2DSize
)(int);
83 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
84 const char *, unsigned, CUjit_option
*, void **);
85 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
86 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
87 CUoccupancyB2DSize
, size_t, int);
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited
= -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
116 if (cuda_lib_inited
!= -1)
117 return cuda_lib_inited
;
118 const char *cuda_runtime_lib
= "libcuda.so.1";
119 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
120 cuda_lib_inited
= false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
129 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", cuda_runtime_lib, #call);
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited
= true;
138 # define CUDA_CALL_PREFIX cuda_lib.
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
151 #include "secure_getenv.h"
153 static void notify_var (const char *, const char *);
157 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
158 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
160 /* Convenience macros for the frequently used CUDA library call and
161 error handling sequence as well as CUDA library calls that
162 do the error checking themselves or don't do it at all. */
164 #define CUDA_CALL_ERET(ERET, FN, ...) \
167 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
168 if (__r != CUDA_SUCCESS) \
170 GOMP_PLUGIN_error (#FN " error: %s", \
176 #define CUDA_CALL(FN, ...) \
177 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
179 #define CUDA_CALL_ASSERT(FN, ...) \
182 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
183 if (__r != CUDA_SUCCESS) \
185 GOMP_PLUGIN_fatal (#FN " error: %s", \
190 #define CUDA_CALL_NOCHECK(FN, ...) \
191 CUDA_CALL_PREFIX FN (__VA_ARGS__)
193 #define CUDA_CALL_EXISTS(FN) \
197 cuda_error (CUresult r
)
199 const char *fallback
= "unknown cuda error";
202 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
205 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
206 if (r
== CUDA_SUCCESS
)
212 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
213 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
214 static char cuda_driver_version_s
[30];
216 static unsigned int instantiated_devices
= 0;
217 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
219 /* NVPTX/CUDA specific definition of asynchronous queues. */
220 struct goacc_asyncqueue
222 CUstream cuda_stream
;
225 struct nvptx_callback
229 struct goacc_asyncqueue
*aq
;
230 struct nvptx_callback
*next
;
233 /* Thread-specific data for PTX. */
237 /* We currently have this embedded inside the plugin because libgomp manages
238 devices through integer target_ids. This might be better if using an
239 opaque target-specific pointer directly from gomp_device_descr. */
240 struct ptx_device
*ptx_dev
;
243 /* Target data function launch information. */
245 struct targ_fn_launch
248 unsigned short dim
[GOMP_DIM_MAX
];
251 /* Target PTX object information. */
259 /* Target data image information. */
261 typedef struct nvptx_tdata
263 const struct targ_ptx_obj
*ptx_objs
;
266 const char *const *var_names
;
269 const struct targ_fn_launch
*fn_descs
;
275 /* Descriptor of a loaded function. */
277 struct targ_fn_descriptor
280 const struct targ_fn_launch
*launch
;
282 int max_threads_per_block
;
285 /* A loaded PTX image. */
286 struct ptx_image_data
288 const void *target_data
;
291 struct targ_fn_descriptor
*fns
; /* Array of functions. */
293 struct ptx_image_data
*next
;
296 struct ptx_free_block
299 struct ptx_free_block
*next
;
319 int max_threads_per_block
;
320 int max_threads_per_multiprocessor
;
321 int default_dims
[GOMP_DIM_MAX
];
323 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
326 struct ptx_image_data
*images
; /* Images loaded on device. */
327 pthread_mutex_t image_lock
; /* Lock for above list. */
329 struct ptx_free_block
*free_blocks
;
330 pthread_mutex_t free_blocks_lock
;
332 /* OpenMP stacks, cached between kernel invocations. */
337 pthread_mutex_t lock
;
340 struct rev_offload
*rev_data
;
341 struct ptx_device
*next
;
344 static struct ptx_device
**ptx_devices
;
346 /* "Native" GPU thread stack size. */
347 static unsigned native_gpu_thread_stack_size
= 0;
349 /* OpenMP kernels reserve a small amount of ".shared" space for use by
350 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
351 default is set here. */
352 static unsigned lowlat_pool_size
= 8 * 1024;
354 static bool nvptx_do_global_cdtors (CUmodule
, struct ptx_device
*,
356 static size_t nvptx_stacks_size ();
357 static void *nvptx_stacks_acquire (struct ptx_device
*, size_t, int);
359 static inline struct nvptx_thread
*
362 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
365 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
366 should be locked on entry and remains locked on exit. */
373 if (instantiated_devices
!= 0)
376 if (!init_cuda_lib ())
379 CUDA_CALL (cuInit
, 0);
381 int cuda_driver_version
;
382 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
383 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
385 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
387 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
388 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
394 /* Select the N'th PTX device for the current host thread. The device must
395 have been previously opened before calling this function. */
398 nvptx_attach_host_thread_to_device (int n
)
402 struct ptx_device
*ptx_dev
;
405 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
406 if (r
== CUDA_ERROR_NOT_PERMITTED
)
408 /* Assume we're in a CUDA callback, just return true. */
411 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
413 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
417 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
423 ptx_dev
= ptx_devices
[n
];
426 GOMP_PLUGIN_error ("device %d not found", n
);
430 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
432 /* We don't necessarily have a current context (e.g. if it has been
433 destroyed. Pop it if we do though. */
435 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
437 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
442 static struct ptx_device
*
443 nvptx_open_device (int n
)
445 struct ptx_device
*ptx_dev
;
446 CUdevice dev
, ctx_dev
;
450 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
452 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
456 ptx_dev
->ctx_shared
= false;
458 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
459 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
461 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
465 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
467 /* The current host thread has an active context for a different device.
470 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
473 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
476 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
478 ptx_dev
->ctx_shared
= true;
480 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
481 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
482 ptx_dev
->overlap
= pi
;
484 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
485 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
488 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
489 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
490 ptx_dev
->concur
= pi
;
492 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
493 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
496 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
497 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
500 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
501 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
502 ptx_dev
->clock_khz
= pi
;
504 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
505 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
506 ptx_dev
->num_sms
= pi
;
508 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
509 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
510 ptx_dev
->regs_per_block
= pi
;
512 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
513 in CUDA 6.0 and newer. */
514 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
515 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
517 /* Fallback: use limit of registers per block, which is usually equal. */
518 if (r
== CUDA_ERROR_INVALID_VALUE
)
519 pi
= ptx_dev
->regs_per_block
;
520 else if (r
!= CUDA_SUCCESS
)
522 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
525 ptx_dev
->regs_per_sm
= pi
;
527 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
528 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
531 GOMP_PLUGIN_error ("Only warp size 32 is supported");
534 ptx_dev
->warp_size
= pi
;
536 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
537 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
538 ptx_dev
->max_threads_per_block
= pi
;
540 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
541 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
542 ptx_dev
->max_threads_per_multiprocessor
= pi
;
544 /* Required below for reverse offload as implemented, but with compute
545 capability >= 2.0 and 64bit device processes, this should be universally be
546 the case; hence, an assert. */
547 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
548 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING
, dev
);
549 assert (r
== CUDA_SUCCESS
&& pi
);
551 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
552 ptx_dev
->default_dims
[i
] = 0;
554 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
557 ptx_dev
->images
= NULL
;
558 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
560 ptx_dev
->free_blocks
= NULL
;
561 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
563 /* "Native" GPU thread stack size. */
565 /* This is intentionally undocumented, until we work out a proper, common
566 scheme (as much as makes sense) between all offload plugins as well
567 as between nvptx offloading use of "native" stacks for OpenACC vs.
568 OpenMP "soft stacks" vs. OpenMP '-msoft-stack-reserve-local=[...]'.
570 GCN offloading has a 'GCN_STACK_SIZE' environment variable (without
571 'GOMP_' prefix): documented; presumably used for all things OpenACC and
572 OpenMP? Based on GCN command-line option '-mstack-size=[...]' (marked
573 "obsolete"), that one may be set via a GCN 'mkoffload'-synthesized
574 'constructor' function. */
575 const char *var_name
= "GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE";
576 const char *env_var
= secure_getenv (var_name
);
577 notify_var (var_name
, env_var
);
582 unsigned long val
= strtoul (env_var
, &endptr
, 10);
583 if (endptr
== NULL
|| *endptr
!= '\0'
584 || errno
== ERANGE
|| errno
== EINVAL
586 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
588 native_gpu_thread_stack_size
= val
;
591 if (native_gpu_thread_stack_size
== 0)
592 ; /* Zero means use default. */
595 GOMP_PLUGIN_debug (0, "Setting \"native\" GPU thread stack size"
596 " ('CU_LIMIT_STACK_SIZE') to %u bytes\n",
597 native_gpu_thread_stack_size
);
598 CUDA_CALL (cuCtxSetLimit
,
599 CU_LIMIT_STACK_SIZE
, (size_t) native_gpu_thread_stack_size
);
602 /* OpenMP "soft stacks". */
603 ptx_dev
->omp_stacks
.ptr
= 0;
604 ptx_dev
->omp_stacks
.size
= 0;
605 pthread_mutex_init (&ptx_dev
->omp_stacks
.lock
, NULL
);
607 ptx_dev
->rev_data
= NULL
;
613 nvptx_close_device (struct ptx_device
*ptx_dev
)
620 for (struct ptx_image_data
*image
= ptx_dev
->images
;
624 if (!nvptx_do_global_cdtors (image
->module
, ptx_dev
,
625 "__do_global_dtors__entry"
626 /* or "__do_global_dtors__entry__mgomp" */))
630 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
632 struct ptx_free_block
*b_next
= b
->next
;
633 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
638 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
639 pthread_mutex_destroy (&ptx_dev
->image_lock
);
641 pthread_mutex_destroy (&ptx_dev
->omp_stacks
.lock
);
643 if (ptx_dev
->omp_stacks
.ptr
)
644 CUDA_CALL (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
646 if (!ptx_dev
->ctx_shared
)
647 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
655 nvptx_get_num_devices (void)
659 /* This function will be called before the plugin has been initialized in
660 order to enumerate available devices, but CUDA API routines can't be used
661 until cuInit has been called. Just call it now (but don't yet do any
662 further initialization). */
663 if (instantiated_devices
== 0)
665 if (!init_cuda_lib ())
667 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
668 /* This is not an error: e.g. we may have CUDA libraries installed but
669 no devices available. */
670 if (r
== CUDA_ERROR_NO_DEVICE
)
672 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
676 else if (r
!= CUDA_SUCCESS
)
677 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r
));
680 CUDA_CALL_ASSERT (cuDeviceGetCount
, &n
);
685 notify_var (const char *var_name
, const char *env_var
)
688 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
690 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
694 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
696 const char *var_name
= "GOMP_NVPTX_JIT";
697 const char *env_var
= secure_getenv (var_name
);
698 notify_var (var_name
, env_var
);
703 const char *c
= env_var
;
709 if (c
[0] == '-' && c
[1] == 'O'
710 && '0' <= c
[2] && c
[2] <= '4'
711 && (c
[3] == '\0' || c
[3] == ' '))
713 *gomp_nvptx_o
= c
[2] - '0';
718 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
724 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
727 CUjit_option opts
[7];
732 CUlinkState linkstate
;
735 size_t linkoutsize
__attribute__ ((unused
));
737 opts
[0] = CU_JIT_WALL_TIME
;
738 optvals
[0] = &elapsed
;
740 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
741 optvals
[1] = &ilog
[0];
743 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
744 optvals
[2] = (void *) sizeof ilog
;
746 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
747 optvals
[3] = &elog
[0];
749 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
750 optvals
[4] = (void *) sizeof elog
;
752 opts
[5] = CU_JIT_LOG_VERBOSE
;
753 optvals
[5] = (void *) 1;
755 static intptr_t gomp_nvptx_o
= -1;
757 static bool init_done
= false;
760 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
765 if (gomp_nvptx_o
!= -1)
767 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
768 optvals
[nopts
] = (void *) gomp_nvptx_o
;
772 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
773 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
775 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
777 for (; num_objs
--; ptx_objs
++)
779 /* cuLinkAddData's 'data' argument erroneously omits the const
781 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
782 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
783 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
784 (char *) ptx_objs
->code
, ptx_objs
->size
,
787 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
788 (char *) ptx_objs
->code
, ptx_objs
->size
,
790 if (r
!= CUDA_SUCCESS
)
792 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
793 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
799 GOMP_PLUGIN_debug (0, "Linking\n");
800 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
802 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
803 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
805 if (r
!= CUDA_SUCCESS
)
807 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
808 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
812 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
813 CUDA_CALL (cuLinkDestroy
, linkstate
);
818 nvptx_exec (void (*fn
), unsigned *dims
, void *targ_mem_desc
,
819 CUdeviceptr dp
, CUstream stream
)
821 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
825 struct nvptx_thread
*nvthd
= nvptx_thread ();
826 int warp_size
= nvthd
->ptx_dev
->warp_size
;
828 function
= targ_fn
->fn
;
830 /* Initialize the launch dimensions. Typically this is constant,
831 provided by the device compiler, but we must permit runtime
834 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
836 if (targ_fn
->launch
->dim
[i
])
837 dims
[i
] = targ_fn
->launch
->dim
[i
];
844 pthread_mutex_lock (&ptx_dev_lock
);
846 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
847 if (!gomp_openacc_dims
[0])
849 /* See if the user provided GOMP_OPENACC_DIM environment
850 variable to specify runtime defaults. */
851 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
852 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
855 if (!nvthd
->ptx_dev
->default_dims
[0])
857 int default_dims
[GOMP_DIM_MAX
];
858 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
859 default_dims
[i
] = gomp_openacc_dims
[i
];
861 int gang
, worker
, vector
;
863 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
864 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
865 int dev_size
= nvthd
->ptx_dev
->num_sms
;
866 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
867 " dev_size=%d, cpu_size=%d\n",
868 warp_size
, block_size
, dev_size
, cpu_size
);
870 gang
= (cpu_size
/ block_size
) * dev_size
;
871 worker
= block_size
/ warp_size
;
875 /* There is no upper bound on the gang size. The best size
876 matches the hardware configuration. Logical gangs are
877 scheduled onto physical hardware. To maximize usage, we
878 should guess a large number. */
879 if (default_dims
[GOMP_DIM_GANG
] < 1)
880 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
881 /* The worker size must not exceed the hardware. */
882 if (default_dims
[GOMP_DIM_WORKER
] < 1
883 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
884 default_dims
[GOMP_DIM_WORKER
] = worker
;
885 /* The vector size must exactly match the hardware. */
886 if (default_dims
[GOMP_DIM_VECTOR
] < 1
887 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
888 default_dims
[GOMP_DIM_VECTOR
] = vector
;
890 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
891 default_dims
[GOMP_DIM_GANG
],
892 default_dims
[GOMP_DIM_WORKER
],
893 default_dims
[GOMP_DIM_VECTOR
]);
895 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
896 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
898 pthread_mutex_unlock (&ptx_dev_lock
);
901 bool default_dim_p
[GOMP_DIM_MAX
];
902 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
903 default_dim_p
[i
] = !dims
[i
];
905 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
907 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
908 if (default_dim_p
[i
])
909 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
911 if (default_dim_p
[GOMP_DIM_VECTOR
])
912 dims
[GOMP_DIM_VECTOR
]
913 = MIN (dims
[GOMP_DIM_VECTOR
],
914 (targ_fn
->max_threads_per_block
/ warp_size
917 if (default_dim_p
[GOMP_DIM_WORKER
])
918 dims
[GOMP_DIM_WORKER
]
919 = MIN (dims
[GOMP_DIM_WORKER
],
920 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
924 /* Handle the case that the compiler allows the runtime to choose
925 the vector-length conservatively, by ignoring
926 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
929 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
930 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
931 exceed targ_fn->max_threads_per_block. */
932 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
933 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
936 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
937 &blocks
, function
, NULL
, 0,
938 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
939 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
940 "grid = %d, block = %d\n", grids
, blocks
);
942 /* Keep the num_gangs proportional to the block size. In
943 the case were a block size is limited by shared-memory
944 or the register file capacity, the runtime will not
945 excessively over assign gangs to the multiprocessor
946 units if their state is going to be swapped out even
947 more than necessary. The constant factor 2 is there to
948 prevent threads from idling when there is insufficient
951 gangs
= 2 * grids
* (blocks
/ warp_size
);
958 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
960 : dims
[GOMP_DIM_VECTOR
]);
961 workers
= blocks
/ actual_vectors
;
962 workers
= MAX (workers
, 1);
963 /* If we need a per-worker barrier ... . */
964 if (actual_vectors
> 32)
965 /* Don't use more barriers than available. */
966 workers
= MIN (workers
, 15);
969 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
970 if (default_dim_p
[i
])
973 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
974 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
975 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
976 default: GOMP_PLUGIN_fatal ("invalid dim");
982 /* Check if the accelerator has sufficient hardware resources to
983 launch the offloaded kernel. */
984 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
985 > targ_fn
->max_threads_per_block
)
988 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
989 " with num_workers = %d and vector_length = %d"
991 "recompile the program with 'num_workers = x and vector_length = y'"
992 " on that offloaded region or '-fopenacc-dim=:x:y' where"
995 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
996 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
999 /* Check if the accelerator has sufficient barrier resources to
1000 launch the offloaded kernel. */
1001 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
1004 = ("The Nvidia accelerator has insufficient barrier resources to launch"
1005 " '%s' with num_workers = %d and vector_length = %d"
1007 "recompile the program with 'num_workers = x' on that offloaded"
1008 " region or '-fopenacc-dim=:x:' where x <= 15"
1010 "or, recompile the program with 'vector_length = 32' on that"
1011 " offloaded region or '-fopenacc-dim=::32'"
1013 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1014 dims
[GOMP_DIM_VECTOR
]);
1017 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1018 " gangs=%u, workers=%u, vectors=%u\n",
1019 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1020 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1024 // num_gangs nctaid.x
1025 // num_workers ntid.y
1026 // vector length ntid.x
1028 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1029 acc_prof_info
*prof_info
= thr
->prof_info
;
1030 acc_event_info enqueue_launch_event_info
;
1031 acc_api_info
*api_info
= thr
->api_info
;
1032 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1035 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
1037 enqueue_launch_event_info
.launch_event
.event_type
1038 = prof_info
->event_type
;
1039 enqueue_launch_event_info
.launch_event
.valid_bytes
1040 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
1041 enqueue_launch_event_info
.launch_event
.parent_construct
1042 = acc_construct_parallel
;
1043 enqueue_launch_event_info
.launch_event
.implicit
= 1;
1044 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
1045 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
1046 enqueue_launch_event_info
.launch_event
.num_gangs
1047 = dims
[GOMP_DIM_GANG
];
1048 enqueue_launch_event_info
.launch_event
.num_workers
1049 = dims
[GOMP_DIM_WORKER
];
1050 enqueue_launch_event_info
.launch_event
.vector_length
1051 = dims
[GOMP_DIM_VECTOR
];
1053 api_info
->device_api
= acc_device_api_cuda
;
1055 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
1060 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1061 dims
[GOMP_DIM_GANG
], 1, 1,
1062 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1063 0, stream
, kargs
, 0);
1067 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
1068 enqueue_launch_event_info
.launch_event
.event_type
1069 = prof_info
->event_type
;
1070 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
1074 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1075 targ_fn
->launch
->fn
);
1078 void * openacc_get_current_cuda_context (void);
1081 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
1083 acc_prof_info
*prof_info
= thr
->prof_info
;
1084 acc_event_info data_event_info
;
1085 acc_api_info
*api_info
= thr
->api_info
;
1087 prof_info
->event_type
= acc_ev_alloc
;
1089 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1090 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1091 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1092 data_event_info
.data_event
.implicit
= 1;
1093 data_event_info
.data_event
.tool_info
= NULL
;
1094 data_event_info
.data_event
.var_name
= NULL
;
1095 data_event_info
.data_event
.bytes
= s
;
1096 data_event_info
.data_event
.host_ptr
= NULL
;
1097 data_event_info
.data_event
.device_ptr
= dp
;
1099 api_info
->device_api
= acc_device_api_cuda
;
1101 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1104 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1105 size threshold, or if FORCE is true. */
1108 nvptx_stacks_free (struct ptx_device
*ptx_dev
, bool force
)
1110 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1111 if (ptx_dev
->omp_stacks
.ptr
1112 && (force
|| ptx_dev
->omp_stacks
.size
> SOFTSTACK_CACHE_LIMIT
))
1114 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
1115 if (r
!= CUDA_SUCCESS
)
1116 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1117 ptx_dev
->omp_stacks
.ptr
= 0;
1118 ptx_dev
->omp_stacks
.size
= 0;
1120 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1124 nvptx_alloc (size_t s
, bool suppress_errors
)
1128 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &d
, s
);
1129 if (suppress_errors
&& r
== CUDA_ERROR_OUT_OF_MEMORY
)
1131 else if (r
!= CUDA_SUCCESS
)
1133 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r
));
1137 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1138 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1140 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1142 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1148 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1150 acc_prof_info
*prof_info
= thr
->prof_info
;
1151 acc_event_info data_event_info
;
1152 acc_api_info
*api_info
= thr
->api_info
;
1154 prof_info
->event_type
= acc_ev_free
;
1156 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1157 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1158 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1159 data_event_info
.data_event
.implicit
= 1;
1160 data_event_info
.data_event
.tool_info
= NULL
;
1161 data_event_info
.data_event
.var_name
= NULL
;
1162 data_event_info
.data_event
.bytes
= -1;
1163 data_event_info
.data_event
.host_ptr
= NULL
;
1164 data_event_info
.data_event
.device_ptr
= p
;
1166 api_info
->device_api
= acc_device_api_cuda
;
1168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1172 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1177 CUresult r
= CUDA_CALL_NOCHECK (cuMemGetAddressRange
, &pb
, &ps
,
1179 if (r
== CUDA_ERROR_NOT_PERMITTED
)
1181 /* We assume that this error indicates we are in a CUDA callback context,
1182 where all CUDA calls are not allowed (see cuStreamAddCallback
1183 documentation for description). Arrange to free this piece of device
1185 struct ptx_free_block
*n
1186 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1188 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1189 n
->next
= ptx_dev
->free_blocks
;
1190 ptx_dev
->free_blocks
= n
;
1191 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1194 else if (r
!= CUDA_SUCCESS
)
1196 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r
));
1199 if ((CUdeviceptr
) p
!= pb
)
1201 GOMP_PLUGIN_error ("invalid device address");
1205 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1206 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1208 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1210 goacc_profiling_acc_ev_free (thr
, p
);
1216 nvptx_get_current_cuda_device (void)
1218 struct nvptx_thread
*nvthd
= nvptx_thread ();
1220 if (!nvthd
|| !nvthd
->ptx_dev
)
1223 return &nvthd
->ptx_dev
->dev
;
1227 nvptx_get_current_cuda_context (void)
1229 struct nvptx_thread
*nvthd
= nvptx_thread ();
1231 if (!nvthd
|| !nvthd
->ptx_dev
)
1234 return nvthd
->ptx_dev
->ctx
;
1237 /* Plugin entry points. */
/* Return the plugin's (device type) name.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
/* Return the UID; if not available return NULL.
   Returns freshly allocated memory.  */
1249 GOMP_OFFLOAD_get_uid (int ord
)
1253 struct ptx_device
*dev
= ptx_devices
[ord
];
1255 if (CUDA_CALL_EXISTS (cuDeviceGetUuid_v2
))
1256 r
= CUDA_CALL_NOCHECK (cuDeviceGetUuid_v2
, &s
, dev
->dev
);
1257 else if (CUDA_CALL_EXISTS (cuDeviceGetUuid
))
1258 r
= CUDA_CALL_NOCHECK (cuDeviceGetUuid
, &s
, dev
->dev
);
1261 if (r
!= CUDA_SUCCESS
)
1264 size_t len
= strlen ("GPU-12345678-9abc-defg-hijk-lmniopqrstuv");
1265 char *str
= (char *) GOMP_PLUGIN_malloc (len
+ 1);
1267 "GPU-%02x" "%02x" "%02x" "%02x"
1270 "-%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x",
1271 (unsigned char) s
.bytes
[0], (unsigned char) s
.bytes
[1],
1272 (unsigned char) s
.bytes
[2], (unsigned char) s
.bytes
[3],
1273 (unsigned char) s
.bytes
[4], (unsigned char) s
.bytes
[5],
1274 (unsigned char) s
.bytes
[6], (unsigned char) s
.bytes
[7],
1275 (unsigned char) s
.bytes
[8], (unsigned char) s
.bytes
[9],
1276 (unsigned char) s
.bytes
[10], (unsigned char) s
.bytes
[11],
1277 (unsigned char) s
.bytes
[12], (unsigned char) s
.bytes
[13],
1278 (unsigned char) s
.bytes
[14], (unsigned char) s
.bytes
[15]);
1283 GOMP_OFFLOAD_get_caps (void)
1285 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1289 GOMP_OFFLOAD_get_type (void)
1291 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1295 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
1297 int num_devices
= nvptx_get_num_devices ();
1298 /* Return -1 if no omp_requires_mask cannot be fulfilled but
1299 devices were present. Unified-shared address: see comment in
1300 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1302 && ((omp_requires_mask
1303 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1304 | GOMP_REQUIRES_SELF_MAPS
1305 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
1306 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
1308 /* Check whether host page access (direct or via migration) is supported;
1309 if so, enable USM. Currently, capabilities is per device type, hence,
1310 check all devices. */
1312 && (omp_requires_mask
1313 & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
| GOMP_REQUIRES_SELF_MAPS
)))
1314 for (int dev
= 0; dev
< num_devices
; dev
++)
1318 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
1319 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS
, dev
);
1320 if (r
!= CUDA_SUCCESS
|| pi
== 0)
1327 GOMP_OFFLOAD_init_device (int n
)
1329 struct ptx_device
*dev
;
1331 pthread_mutex_lock (&ptx_dev_lock
);
1333 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1335 pthread_mutex_unlock (&ptx_dev_lock
);
1339 dev
= nvptx_open_device (n
);
1342 ptx_devices
[n
] = dev
;
1343 instantiated_devices
++;
1346 const char *var_name
= "GOMP_NVPTX_LOWLAT_POOL";
1347 const char *env_var
= secure_getenv (var_name
);
1348 notify_var (var_name
, env_var
);
1350 if (env_var
!= NULL
)
1353 unsigned long val
= strtoul (env_var
, &endptr
, 10);
1354 if (endptr
== NULL
|| *endptr
!= '\0'
1355 || errno
== ERANGE
|| errno
== EINVAL
1357 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
1359 lowlat_pool_size
= val
;
1362 pthread_mutex_unlock (&ptx_dev_lock
);
1368 GOMP_OFFLOAD_fini_device (int n
)
1370 pthread_mutex_lock (&ptx_dev_lock
);
1372 if (ptx_devices
[n
] != NULL
)
1374 if (!nvptx_attach_host_thread_to_device (n
)
1375 || !nvptx_close_device (ptx_devices
[n
]))
1377 pthread_mutex_unlock (&ptx_dev_lock
);
1380 ptx_devices
[n
] = NULL
;
1381 instantiated_devices
--;
1384 if (instantiated_devices
== 0)
1390 pthread_mutex_unlock (&ptx_dev_lock
);
1394 /* Return the libgomp version number we're compatible with. There is
1395 no requirement for cross-version compatibility. */
1398 GOMP_OFFLOAD_version (void)
1400 return GOMP_VERSION
;
1403 /* Initialize __nvptx_clocktick, if present in MODULE. */
1406 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1409 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1410 module
, "__nvptx_clocktick");
1411 if (r
== CUDA_ERROR_NOT_FOUND
)
1413 if (r
!= CUDA_SUCCESS
)
1414 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1415 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1416 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1417 sizeof (__nvptx_clocktick
));
1418 if (r
!= CUDA_SUCCESS
)
1419 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1422 /* Invoke MODULE's global constructors/destructors. */
1425 nvptx_do_global_cdtors (CUmodule module
, struct ptx_device
*ptx_dev
,
1426 const char *funcname
)
1429 char *funcname_mgomp
= NULL
;
1432 r
= CUDA_CALL_NOCHECK (cuModuleGetFunction
,
1433 &funcptr
, module
, funcname
);
1434 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1435 funcname
, cuda_error (r
));
1436 if (r
== CUDA_ERROR_NOT_FOUND
)
1438 /* Try '[funcname]__mgomp'. */
1440 size_t funcname_len
= strlen (funcname
);
1441 const char *mgomp_suffix
= "__mgomp";
1442 size_t mgomp_suffix_len
= strlen (mgomp_suffix
);
1444 = GOMP_PLUGIN_malloc (funcname_len
+ mgomp_suffix_len
+ 1);
1445 memcpy (funcname_mgomp
, funcname
, funcname_len
);
1446 memcpy (funcname_mgomp
+ funcname_len
,
1447 mgomp_suffix
, mgomp_suffix_len
+ 1);
1448 funcname
= funcname_mgomp
;
1450 r
= CUDA_CALL_NOCHECK (cuModuleGetFunction
,
1451 &funcptr
, module
, funcname
);
1452 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1453 funcname
, cuda_error (r
));
1455 if (r
== CUDA_ERROR_NOT_FOUND
)
1457 else if (r
!= CUDA_SUCCESS
)
1459 GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
1460 funcname
, cuda_error (r
));
1465 /* If necessary, set up soft stack. */
1466 void *nvptx_stacks_0
;
1470 size_t stack_size
= nvptx_stacks_size ();
1471 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1472 nvptx_stacks_0
= nvptx_stacks_acquire (ptx_dev
, stack_size
, 1);
1473 nvptx_stacks_0
+= stack_size
;
1474 kargs
[0] = &nvptx_stacks_0
;
1476 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
,
1479 /* sharedMemBytes */ 0,
1481 /* kernelParams */ funcname_mgomp
? kargs
: NULL
,
1483 if (r
!= CUDA_SUCCESS
)
1485 GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
1486 funcname
, cuda_error (r
));
1490 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
,
1492 if (r
!= CUDA_SUCCESS
)
1494 GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
1495 funcname
, cuda_error (r
));
1500 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1504 free (funcname_mgomp
);
1509 /* Load the (partial) program described by TARGET_DATA to device
1510 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1511 will contain the on-device addresses of the functions for reverse offload.
1512 To be freed by the caller. */
1515 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1516 struct addr_pair
**target_table
,
1517 uint64_t **rev_fn_table
,
1518 uint64_t *host_ind_fn_table
)
1521 const char *const *var_names
;
1522 const struct targ_fn_launch
*fn_descs
;
1523 unsigned int fn_entries
, var_entries
, ind_fn_entries
, other_entries
, i
, j
;
1524 struct targ_fn_descriptor
*targ_fns
;
1525 struct addr_pair
*targ_tbl
;
1526 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1527 struct ptx_image_data
*new_image
;
1528 struct ptx_device
*dev
;
1530 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1532 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1533 " (expected %u, received %u)",
1534 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1538 if (!nvptx_attach_host_thread_to_device (ord
)
1539 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1542 dev
= ptx_devices
[ord
];
1544 /* The mkoffload utility emits a struct of pointers/integers at the
1545 start of each offload image. The array of kernel names and the
1546 functions addresses form a one-to-one correspondence. */
1548 var_entries
= img_header
->var_num
;
1549 var_names
= img_header
->var_names
;
1550 fn_entries
= img_header
->fn_num
;
1551 fn_descs
= img_header
->fn_descs
;
1552 ind_fn_entries
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
1553 ? img_header
->ind_fn_num
: 0;
1555 /* Currently, other_entries contains only the struct of ICVs. */
1558 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1559 * (fn_entries
+ var_entries
+ other_entries
));
1560 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1563 *target_table
= targ_tbl
;
1565 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1566 new_image
->target_data
= target_data
;
1567 new_image
->module
= module
;
1568 new_image
->fns
= targ_fns
;
1570 pthread_mutex_lock (&dev
->image_lock
);
1571 new_image
->next
= dev
->images
;
1572 dev
->images
= new_image
;
1573 pthread_mutex_unlock (&dev
->image_lock
);
1575 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1577 CUfunction function
;
1580 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1582 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1583 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1584 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1585 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1587 targ_fns
->fn
= function
;
1588 targ_fns
->launch
= &fn_descs
[i
];
1589 targ_fns
->regs_per_thread
= nregs
;
1590 targ_fns
->max_threads_per_block
= mthrs
;
1592 targ_tbl
->start
= (uintptr_t) targ_fns
;
1593 targ_tbl
->end
= targ_tbl
->start
+ 1;
1596 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1601 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1602 &var
, &bytes
, module
, var_names
[j
]);
1604 targ_tbl
->start
= (uintptr_t) var
;
1605 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1608 if (ind_fn_entries
> 0)
1613 /* Read indirect function table from image. */
1614 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1615 "$offload_ind_func_table");
1616 if (r
!= CUDA_SUCCESS
)
1617 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1618 assert (bytes
== sizeof (uint64_t) * ind_fn_entries
);
1620 uint64_t ind_fn_table
[ind_fn_entries
];
1621 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, ind_fn_table
, var
, bytes
);
1622 if (r
!= CUDA_SUCCESS
)
1623 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1625 /* Build host->target address map for indirect functions. */
1626 uint64_t ind_fn_map
[ind_fn_entries
* 2 + 1];
1627 for (unsigned k
= 0; k
< ind_fn_entries
; k
++)
1629 ind_fn_map
[k
* 2] = host_ind_fn_table
[k
];
1630 ind_fn_map
[k
* 2 + 1] = ind_fn_table
[k
];
1631 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1632 k
, host_ind_fn_table
[k
], ind_fn_table
[k
]);
1634 ind_fn_map
[ind_fn_entries
* 2] = 0;
1636 /* Write the map onto the target. */
1637 void *map_target_addr
1638 = GOMP_OFFLOAD_alloc (ord
, sizeof (ind_fn_map
));
1639 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr
);
1641 GOMP_OFFLOAD_host2dev (ord
, map_target_addr
,
1643 sizeof (ind_fn_map
));
1645 /* Write address of the map onto the target. */
1648 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1649 module
, XSTRING (GOMP_INDIRECT_ADDR_MAP
));
1650 if (r
!= CUDA_SUCCESS
)
1651 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1654 GOMP_PLUGIN_debug (0,
1655 "Indirect map variable found at %llx with size %ld\n",
1658 GOMP_OFFLOAD_host2dev (ord
, (void *) varptr
, &map_target_addr
,
1659 sizeof (map_target_addr
));
1664 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1665 module
, XSTRING (GOMP_ADDITIONAL_ICVS
));
1667 if (r
== CUDA_SUCCESS
)
1669 targ_tbl
->start
= (uintptr_t) varptr
;
1670 targ_tbl
->end
= (uintptr_t) (varptr
+ varsize
);
1673 /* The variable was not in this image. */
1674 targ_tbl
->start
= targ_tbl
->end
= 0;
1676 if (rev_fn_table
&& fn_entries
== 0)
1677 *rev_fn_table
= NULL
;
1678 else if (rev_fn_table
)
1683 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1684 "$offload_func_table");
1685 if (r
!= CUDA_SUCCESS
)
1686 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1687 assert (bytes
== sizeof (uint64_t) * fn_entries
);
1688 *rev_fn_table
= GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries
);
1689 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, *rev_fn_table
, var
, bytes
);
1690 if (r
!= CUDA_SUCCESS
)
1691 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1692 /* Free if only NULL entries. */
1693 for (i
= 0; i
< fn_entries
; ++i
)
1694 if ((*rev_fn_table
)[i
] != 0)
1696 if (i
== fn_entries
)
1698 free (*rev_fn_table
);
1699 *rev_fn_table
= NULL
;
1703 if (rev_fn_table
&& *rev_fn_table
&& dev
->rev_data
== NULL
)
1705 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1706 available but it might be not. One reason could be: if the user code
1707 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1708 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1709 are not linked in. */
1710 CUdeviceptr device_rev_offload_var
;
1711 size_t device_rev_offload_size
;
1712 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
,
1713 &device_rev_offload_var
,
1714 &device_rev_offload_size
, module
,
1715 XSTRING (GOMP_REV_OFFLOAD_VAR
));
1716 if (r
!= CUDA_SUCCESS
)
1718 free (*rev_fn_table
);
1719 *rev_fn_table
= NULL
;
1723 /* cuMemHostAlloc memory is accessible on the device, if
1724 unified-shared address is supported; this is assumed - see comment
1725 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1726 CUDA_CALL_ASSERT (cuMemHostAlloc
, (void **) &dev
->rev_data
,
1727 sizeof (*dev
->rev_data
), CU_MEMHOSTALLOC_DEVICEMAP
);
1728 CUdeviceptr dp
= (CUdeviceptr
) dev
->rev_data
;
1729 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, device_rev_offload_var
, &dp
,
1731 if (r
!= CUDA_SUCCESS
)
1732 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1736 nvptx_set_clocktick (module
, dev
);
1738 if (!nvptx_do_global_cdtors (module
, dev
,
1739 "__do_global_ctors__entry"
1740 /* or "__do_global_ctors__entry__mgomp" */))
1743 return fn_entries
+ var_entries
+ other_entries
;
1746 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1747 function descriptors allocated by G_O_load_image. */
1750 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1752 struct ptx_image_data
*image
, **prev_p
;
1753 struct ptx_device
*dev
= ptx_devices
[ord
];
1755 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1757 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1758 " (expected %u, received %u)",
1759 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1764 pthread_mutex_lock (&dev
->image_lock
);
1765 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1766 if (image
->target_data
== target_data
)
1768 if (!nvptx_do_global_cdtors (image
->module
, dev
,
1769 "__do_global_dtors__entry"
1770 /* or "__do_global_dtors__entry__mgomp" */))
1773 *prev_p
= image
->next
;
1774 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1780 pthread_mutex_unlock (&dev
->image_lock
);
1785 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1787 if (!nvptx_attach_host_thread_to_device (ord
))
1790 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1791 struct ptx_free_block
*blocks
, *tmp
;
1793 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1794 blocks
= ptx_dev
->free_blocks
;
1795 ptx_dev
->free_blocks
= NULL
;
1796 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1798 nvptx_stacks_free (ptx_dev
, false);
1803 nvptx_free (blocks
->ptr
, ptx_dev
);
1808 void *d
= nvptx_alloc (size
, true);
1813 /* Memory allocation failed. Try freeing the stacks block, and
1815 nvptx_stacks_free (ptx_dev
, true);
1816 return nvptx_alloc (size
, false);
1821 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1823 return (nvptx_attach_host_thread_to_device (ord
)
1824 && nvptx_free (ptr
, ptx_devices
[ord
]));
1828 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *),
1829 size_t mapnum
__attribute__((unused
)),
1830 void **hostaddrs
__attribute__((unused
)),
1832 unsigned *dims
, void *targ_mem_desc
)
1834 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1836 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1837 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, NULL
);
1839 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1840 const char *maybe_abort_msg
= "(perhaps abort was called)";
1841 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1842 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1844 else if (r
!= CUDA_SUCCESS
)
1845 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1849 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *),
1850 size_t mapnum
__attribute__((unused
)),
1851 void **hostaddrs
__attribute__((unused
)),
1853 unsigned *dims
, void *targ_mem_desc
,
1854 struct goacc_asyncqueue
*aq
)
1856 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1858 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1859 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, aq
->cuda_stream
);
1863 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1865 struct ptx_device
*ptx_dev
;
1866 struct nvptx_thread
*nvthd
1867 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1870 ptx_dev
= ptx_devices
[ord
];
1874 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1876 assert (ptx_dev
->ctx
);
1879 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1881 nvthd
->ptx_dev
= ptx_dev
;
1883 return (void *) nvthd
;
/* Release per-thread data allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* OpenACC interop: return the current CUDA device handle.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* OpenACC interop: return the current CUDA context.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1904 /* This returns a CUstream. */
1906 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
1908 return (void *) aq
->cuda_stream
;
1911 /* This takes a CUstream. */
1913 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
1915 if (aq
->cuda_stream
)
1917 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1918 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1921 aq
->cuda_stream
= (CUstream
) stream
;
1925 static struct goacc_asyncqueue
*
1926 nvptx_goacc_asyncqueue_construct (unsigned int flags
)
1928 CUstream stream
= NULL
;
1929 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, flags
);
1931 struct goacc_asyncqueue
*aq
1932 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1933 aq
->cuda_stream
= stream
;
1937 struct goacc_asyncqueue
*
1938 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1940 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT
);
1944 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue
*aq
)
1946 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
/* Plugin entry point: destroy async queue AQ.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_destruct (aq);
}
1958 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1960 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1961 if (r
== CUDA_SUCCESS
)
1963 if (r
== CUDA_ERROR_NOT_READY
)
1966 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
1971 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue
*aq
)
1973 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
/* Plugin entry point: synchronize with async queue AQ.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_synchronize (aq);
}
1984 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1985 struct goacc_asyncqueue
*aq2
)
1988 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1989 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1990 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1995 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1997 if (res
!= CUDA_SUCCESS
)
1998 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1999 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
2005 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
2006 void (*callback_fn
)(void *),
2009 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
2010 b
->fn
= callback_fn
;
2013 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
2014 cuda_callback_wrapper
, (void *) b
, 0);
2018 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
2026 GOMP_PLUGIN_error ("invalid device address");
2029 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
2032 GOMP_PLUGIN_error ("invalid device address");
2037 GOMP_PLUGIN_error ("invalid host address");
2042 GOMP_PLUGIN_error ("invalid host or device address");
2045 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
2047 GOMP_PLUGIN_error ("invalid size");
2054 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
2056 if (!nvptx_attach_host_thread_to_device (ord
)
2057 || !cuda_memcpy_sanity_check (src
, dst
, n
))
2059 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
2064 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
2066 if (!nvptx_attach_host_thread_to_device (ord
)
2067 || !cuda_memcpy_sanity_check (dst
, src
, n
))
2069 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
2074 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
2076 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
2081 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
2082 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
2083 size_t dst_offset0_len
, size_t dst_dim1_size
,
2084 const void *src
, size_t src_offset1_size
,
2085 size_t src_offset0_len
, size_t src_dim1_size
)
2087 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
2090 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2094 memset (&data
, 0, sizeof (data
));
2095 data
.WidthInBytes
= dim1_size
;
2096 data
.Height
= dim0_len
;
2100 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
2105 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
2106 data
.dstDevice
= (CUdeviceptr
) dst
;
2108 data
.dstPitch
= dst_dim1_size
;
2109 data
.dstXInBytes
= dst_offset1_size
;
2110 data
.dstY
= dst_offset0_len
;
2114 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
2119 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
2120 data
.srcDevice
= (CUdeviceptr
) src
;
2122 data
.srcPitch
= src_dim1_size
;
2123 data
.srcXInBytes
= src_offset1_size
;
2124 data
.srcY
= src_offset0_len
;
2126 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0)
2128 /* Adjust origin to the actual array data, else the CUDA 2D memory
2129 copy API calls below may fail to validate source/dest pointers
2130 correctly (especially for Fortran where the "virtual origin" of an
2131 array is often outside the stored data). */
2133 data
.srcHost
= (const void *) ((const char *) data
.srcHost
2134 + data
.srcY
* data
.srcPitch
2135 + data
.srcXInBytes
);
2137 data
.srcDevice
+= data
.srcY
* data
.srcPitch
+ data
.srcXInBytes
;
2138 data
.srcXInBytes
= 0;
2142 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0)
2146 data
.dstHost
= (void *) ((char *) data
.dstHost
2147 + data
.dstY
* data
.dstPitch
2148 + data
.dstXInBytes
);
2150 data
.dstDevice
+= data
.dstY
* data
.dstPitch
+ data
.dstXInBytes
;
2151 data
.dstXInBytes
= 0;
2155 CUresult res
= CUDA_CALL_NOCHECK (cuMemcpy2D
, &data
);
2156 if (res
== CUDA_ERROR_INVALID_VALUE
)
2157 /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
2158 for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
2159 with an error; try the slower cuMemcpy2DUnaligned now. */
2160 CUDA_CALL (cuMemcpy2DUnaligned
, &data
);
2161 else if (res
!= CUDA_SUCCESS
)
2163 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res
));
2170 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
2171 size_t dim1_len
, size_t dim0_len
, void *dst
,
2172 size_t dst_offset2_size
, size_t dst_offset1_len
,
2173 size_t dst_offset0_len
, size_t dst_dim2_size
,
2174 size_t dst_dim1_len
, const void *src
,
2175 size_t src_offset2_size
, size_t src_offset1_len
,
2176 size_t src_offset0_len
, size_t src_dim2_size
,
2177 size_t src_dim1_len
)
2179 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
2182 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2186 memset (&data
, 0, sizeof (data
));
2187 data
.WidthInBytes
= dim2_size
;
2188 data
.Height
= dim1_len
;
2189 data
.Depth
= dim0_len
;
2193 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
2198 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
2199 data
.dstDevice
= (CUdeviceptr
) dst
;
2201 data
.dstPitch
= dst_dim2_size
;
2202 data
.dstHeight
= dst_dim1_len
;
2203 data
.dstXInBytes
= dst_offset2_size
;
2204 data
.dstY
= dst_offset1_len
;
2205 data
.dstZ
= dst_offset0_len
;
2209 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
2214 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
2215 data
.srcDevice
= (CUdeviceptr
) src
;
2217 data
.srcPitch
= src_dim2_size
;
2218 data
.srcHeight
= src_dim1_len
;
2219 data
.srcXInBytes
= src_offset2_size
;
2220 data
.srcY
= src_offset1_len
;
2221 data
.srcZ
= src_offset0_len
;
2223 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0 || data
.srcZ
!= 0)
2225 /* Adjust origin to the actual array data, else the CUDA 3D memory
2226 copy API call below may fail to validate source/dest pointers
2227 correctly (especially for Fortran where the "virtual origin" of an
2228 array is often outside the stored data). */
2231 = (const void *) ((const char *) data
.srcHost
2232 + (data
.srcZ
* data
.srcHeight
+ data
.srcY
)
2234 + data
.srcXInBytes
);
2237 += (data
.srcZ
* data
.srcHeight
+ data
.srcY
) * data
.srcPitch
2239 data
.srcXInBytes
= 0;
2244 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0 || data
.dstZ
!= 0)
2248 data
.dstHost
= (void *) ((char *) data
.dstHost
2249 + (data
.dstZ
* data
.dstHeight
+ data
.dstY
)
2251 + data
.dstXInBytes
);
2254 += (data
.dstZ
* data
.dstHeight
+ data
.dstY
) * data
.dstPitch
2256 data
.dstXInBytes
= 0;
2261 CUDA_CALL (cuMemcpy3D
, &data
);
2266 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
2267 size_t n
, struct goacc_asyncqueue
*aq
)
2269 if (!nvptx_attach_host_thread_to_device (ord
)
2270 || !cuda_memcpy_sanity_check (src
, dst
, n
))
2272 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
2277 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
2278 size_t n
, struct goacc_asyncqueue
*aq
)
2280 if (!nvptx_attach_host_thread_to_device (ord
)
2281 || !cuda_memcpy_sanity_check (dst
, src
, n
))
2283 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
2287 union goacc_property_value
2288 GOMP_OFFLOAD_openacc_get_property (int n
, enum goacc_property prop
)
2290 union goacc_property_value propval
= { .val
= 0 };
2292 pthread_mutex_lock (&ptx_dev_lock
);
2294 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
2296 pthread_mutex_unlock (&ptx_dev_lock
);
2300 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
2303 case GOACC_PROPERTY_MEMORY
:
2307 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
2308 propval
.val
= total_mem
;
2311 case GOACC_PROPERTY_FREE_MEMORY
:
2317 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
2318 if (ptx_dev
->dev
== ctxdev
)
2319 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2320 else if (ptx_dev
->ctx
)
2324 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
2325 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2326 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
2332 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
2334 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2335 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
2337 propval
.val
= free_mem
;
2340 case GOACC_PROPERTY_NAME
:
2341 propval
.ptr
= ptx_dev
->name
;
2343 case GOACC_PROPERTY_VENDOR
:
2344 propval
.ptr
= "Nvidia";
2346 case GOACC_PROPERTY_DRIVER
:
2347 propval
.ptr
= cuda_driver_version_s
;
2353 pthread_mutex_unlock (&ptx_dev_lock
);
2357 /* Adjust launch dimensions: pick good values for number of blocks and warps
2358 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2362 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2363 struct ptx_device
*ptx_dev
,
2364 int *teams_p
, int *threads_p
)
2366 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2367 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2368 and libgcc, which matches documented limit of all GPUs as of 2015. */
2369 if (max_warps_block
> 32)
2370 max_warps_block
= 32;
2371 if (*threads_p
<= 0)
2373 if (*threads_p
> max_warps_block
)
2374 *threads_p
= max_warps_block
;
2376 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2377 /* This is an estimate of how many blocks the device can host simultaneously.
2378 Actual limit, which may be lower, can be queried with "occupancy control"
2379 driver interface (since CUDA 6.0). */
2380 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2381 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2382 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   kernel launches.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
2394 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2395 the storage should be held on entry, and remains held on exit. */
2398 nvptx_stacks_acquire (struct ptx_device
*ptx_dev
, size_t size
, int num
)
2400 if (ptx_dev
->omp_stacks
.ptr
&& ptx_dev
->omp_stacks
.size
>= size
* num
)
2401 return (void *) ptx_dev
->omp_stacks
.ptr
;
2403 /* Free the old, too-small stacks. */
2404 if (ptx_dev
->omp_stacks
.ptr
)
2406 CUresult r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2407 if (r
!= CUDA_SUCCESS
)
2408 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r
));
2409 r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
2410 if (r
!= CUDA_SUCCESS
)
2411 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
2414 /* Make new and bigger stacks, and remember where we put them and how big
2416 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &ptx_dev
->omp_stacks
.ptr
,
2418 if (r
!= CUDA_SUCCESS
)
2419 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2421 ptx_dev
->omp_stacks
.size
= size
* num
;
2423 return (void *) ptx_dev
->omp_stacks
.ptr
;
2428 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2430 struct targ_fn_descriptor
*tgt_fn_desc
2431 = (struct targ_fn_descriptor
*) tgt_fn
;
2432 CUfunction function
= tgt_fn_desc
->fn
;
2433 const struct targ_fn_launch
*launch
= tgt_fn_desc
->launch
;
2434 const char *fn_name
= launch
->fn
;
2436 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2437 const char *maybe_abort_msg
= "(perhaps abort was called)";
2438 int teams
= 0, threads
= 0;
2441 GOMP_PLUGIN_fatal ("No target arguments provided");
2444 intptr_t id
= (intptr_t) *args
++, val
;
2445 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2446 val
= (intptr_t) *args
++;
2448 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2449 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2451 val
= val
> INT_MAX
? INT_MAX
: val
;
2452 id
&= GOMP_TARGET_ARG_ID_MASK
;
2453 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2455 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2458 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2460 bool reverse_offload
= ptx_dev
->rev_data
!= NULL
;
2461 struct goacc_asyncqueue
*reverse_offload_aq
= NULL
;
2462 if (reverse_offload
)
2465 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING
);
2466 if (!reverse_offload_aq
)
2467 exit (EXIT_FAILURE
);
2470 size_t stack_size
= nvptx_stacks_size ();
2472 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
2473 void *stacks
= nvptx_stacks_acquire (ptx_dev
, stack_size
, teams
* threads
);
2474 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2475 size_t fn_args_size
= sizeof fn_args
;
2477 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2478 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2481 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2482 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2483 __FUNCTION__
, fn_name
, teams
, threads
);
2484 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2485 32, threads
, 1, lowlat_pool_size
, NULL
, NULL
, config
);
2486 if (r
!= CUDA_SUCCESS
)
2487 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2488 if (reverse_offload
)
2491 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, NULL
);
2492 if (r
== CUDA_SUCCESS
)
2494 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2495 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r
),
2497 else if (r
!= CUDA_ERROR_NOT_READY
)
2498 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
2500 if (__atomic_load_n (&ptx_dev
->rev_data
->fn
, __ATOMIC_ACQUIRE
) != 0)
2502 struct rev_offload
*rev_data
= ptx_dev
->rev_data
;
2503 GOMP_PLUGIN_target_rev (rev_data
->fn
, rev_data
->mapnum
,
2504 rev_data
->addrs
, rev_data
->sizes
,
2505 rev_data
->kinds
, rev_data
->dev_num
,
2506 reverse_offload_aq
);
2507 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq
))
2508 exit (EXIT_FAILURE
);
2509 __atomic_store_n (&rev_data
->fn
, 0, __ATOMIC_RELEASE
);
2514 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2515 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2516 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2518 else if (r
!= CUDA_SUCCESS
)
2519 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2521 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
2523 if (reverse_offload
)
2525 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq
))
2526 exit (EXIT_FAILURE
);
2530 /* TODO: Implement GOMP_OFFLOAD_async_run. */