/* Plugin for NVPTX execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
70 #if CUDA_VERSION < 6000
71 extern CUresult
cuGetErrorString (CUresult
, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
75 #if CUDA_VERSION >= 6050
78 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
79 const char *, unsigned, CUjit_option
*, void **);
80 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
82 typedef size_t (*CUoccupancyB2DSize
)(int);
83 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
84 const char *, unsigned, CUjit_option
*, void **);
85 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
86 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
87 CUoccupancyB2DSize
, size_t, int);
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited
= -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
116 if (cuda_lib_inited
!= -1)
117 return cuda_lib_inited
;
118 const char *cuda_runtime_lib
= "libcuda.so.1";
119 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
120 cuda_lib_inited
= false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited
= true;
138 # define CUDA_CALL_PREFIX cuda_lib.
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
151 #include "secure_getenv.h"
155 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
156 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
158 /* Convenience macros for the frequently used CUDA library call and
159 error handling sequence as well as CUDA library calls that
160 do the error checking themselves or don't do it at all. */
162 #define CUDA_CALL_ERET(ERET, FN, ...) \
165 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
166 if (__r != CUDA_SUCCESS) \
168 GOMP_PLUGIN_error (#FN " error: %s", \
174 #define CUDA_CALL(FN, ...) \
175 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
177 #define CUDA_CALL_ASSERT(FN, ...) \
180 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
181 if (__r != CUDA_SUCCESS) \
183 GOMP_PLUGIN_fatal (#FN " error: %s", \
188 #define CUDA_CALL_NOCHECK(FN, ...) \
189 CUDA_CALL_PREFIX FN (__VA_ARGS__)
191 #define CUDA_CALL_EXISTS(FN) \
195 cuda_error (CUresult r
)
197 const char *fallback
= "unknown cuda error";
200 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
203 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
204 if (r
== CUDA_SUCCESS
)
210 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
211 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
212 static char cuda_driver_version_s
[30];
214 static unsigned int instantiated_devices
= 0;
215 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
217 /* NVPTX/CUDA specific definition of asynchronous queues. */
218 struct goacc_asyncqueue
220 CUstream cuda_stream
;
223 struct nvptx_callback
227 struct goacc_asyncqueue
*aq
;
228 struct nvptx_callback
*next
;
231 /* Thread-specific data for PTX. */
235 /* We currently have this embedded inside the plugin because libgomp manages
236 devices through integer target_ids. This might be better if using an
237 opaque target-specific pointer directly from gomp_device_descr. */
238 struct ptx_device
*ptx_dev
;
241 /* Target data function launch information. */
243 struct targ_fn_launch
246 unsigned short dim
[GOMP_DIM_MAX
];
249 /* Target PTX object information. */
257 /* Target data image information. */
259 typedef struct nvptx_tdata
261 const struct targ_ptx_obj
*ptx_objs
;
264 const char *const *var_names
;
267 const struct targ_fn_launch
*fn_descs
;
273 /* Descriptor of a loaded function. */
275 struct targ_fn_descriptor
278 const struct targ_fn_launch
*launch
;
280 int max_threads_per_block
;
283 /* A loaded PTX image. */
284 struct ptx_image_data
286 const void *target_data
;
289 struct targ_fn_descriptor
*fns
; /* Array of functions. */
291 struct ptx_image_data
*next
;
294 struct ptx_free_block
297 struct ptx_free_block
*next
;
317 int max_threads_per_block
;
318 int max_threads_per_multiprocessor
;
319 int default_dims
[GOMP_DIM_MAX
];
321 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
324 struct ptx_image_data
*images
; /* Images loaded on device. */
325 pthread_mutex_t image_lock
; /* Lock for above list. */
327 struct ptx_free_block
*free_blocks
;
328 pthread_mutex_t free_blocks_lock
;
330 /* OpenMP stacks, cached between kernel invocations. */
335 pthread_mutex_t lock
;
338 struct rev_offload
*rev_data
;
339 struct ptx_device
*next
;
342 static struct ptx_device
**ptx_devices
;
344 /* OpenMP kernels reserve a small amount of ".shared" space for use by
345 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
346 default is set here. */
347 static unsigned lowlat_pool_size
= 8 * 1024;
/* Return the per-thread NVPTX-specific data for the current OpenACC
   thread, as maintained by libgomp.  May be NULL outside an OpenACC
   execution context.  */

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
355 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
356 should be locked on entry and remains locked on exit. */
363 if (instantiated_devices
!= 0)
366 if (!init_cuda_lib ())
369 CUDA_CALL (cuInit
, 0);
371 int cuda_driver_version
;
372 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
373 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
375 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
377 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
378 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
384 /* Select the N'th PTX device for the current host thread. The device must
385 have been previously opened before calling this function. */
388 nvptx_attach_host_thread_to_device (int n
)
392 struct ptx_device
*ptx_dev
;
395 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
396 if (r
== CUDA_ERROR_NOT_PERMITTED
)
398 /* Assume we're in a CUDA callback, just return true. */
401 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
407 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
413 ptx_dev
= ptx_devices
[n
];
416 GOMP_PLUGIN_error ("device %d not found", n
);
420 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
422 /* We don't necessarily have a current context (e.g. if it has been
423 destroyed. Pop it if we do though. */
425 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
427 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
432 static struct ptx_device
*
433 nvptx_open_device (int n
)
435 struct ptx_device
*ptx_dev
;
436 CUdevice dev
, ctx_dev
;
440 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
442 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
446 ptx_dev
->ctx_shared
= false;
448 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
449 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
451 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
455 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
457 /* The current host thread has an active context for a different device.
460 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
463 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
466 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
468 ptx_dev
->ctx_shared
= true;
470 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
471 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
472 ptx_dev
->overlap
= pi
;
474 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
475 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
478 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
479 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
480 ptx_dev
->concur
= pi
;
482 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
483 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
486 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
487 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
490 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
491 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
492 ptx_dev
->clock_khz
= pi
;
494 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
495 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
496 ptx_dev
->num_sms
= pi
;
498 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
499 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
500 ptx_dev
->regs_per_block
= pi
;
502 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
503 in CUDA 6.0 and newer. */
504 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
505 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
507 /* Fallback: use limit of registers per block, which is usually equal. */
508 if (r
== CUDA_ERROR_INVALID_VALUE
)
509 pi
= ptx_dev
->regs_per_block
;
510 else if (r
!= CUDA_SUCCESS
)
512 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
515 ptx_dev
->regs_per_sm
= pi
;
517 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
518 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
521 GOMP_PLUGIN_error ("Only warp size 32 is supported");
524 ptx_dev
->warp_size
= pi
;
526 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
527 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
528 ptx_dev
->max_threads_per_block
= pi
;
530 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
531 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
532 ptx_dev
->max_threads_per_multiprocessor
= pi
;
534 /* Required below for reverse offload as implemented, but with compute
535 capability >= 2.0 and 64bit device processes, this should be universally be
536 the case; hence, an assert. */
537 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
538 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING
, dev
);
539 assert (r
== CUDA_SUCCESS
&& pi
);
541 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
542 ptx_dev
->default_dims
[i
] = 0;
544 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
547 ptx_dev
->images
= NULL
;
548 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
550 ptx_dev
->free_blocks
= NULL
;
551 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
553 ptx_dev
->omp_stacks
.ptr
= 0;
554 ptx_dev
->omp_stacks
.size
= 0;
555 pthread_mutex_init (&ptx_dev
->omp_stacks
.lock
, NULL
);
557 ptx_dev
->rev_data
= NULL
;
563 nvptx_close_device (struct ptx_device
*ptx_dev
)
568 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
570 struct ptx_free_block
*b_next
= b
->next
;
571 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
576 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
577 pthread_mutex_destroy (&ptx_dev
->image_lock
);
579 pthread_mutex_destroy (&ptx_dev
->omp_stacks
.lock
);
581 if (ptx_dev
->omp_stacks
.ptr
)
582 CUDA_CALL (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
584 if (!ptx_dev
->ctx_shared
)
585 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
592 nvptx_get_num_devices (void)
596 /* This function will be called before the plugin has been initialized in
597 order to enumerate available devices, but CUDA API routines can't be used
598 until cuInit has been called. Just call it now (but don't yet do any
599 further initialization). */
600 if (instantiated_devices
== 0)
602 if (!init_cuda_lib ())
604 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
605 /* This is not an error: e.g. we may have CUDA libraries installed but
606 no devices available. */
607 if (r
!= CUDA_SUCCESS
)
609 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
615 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
/* Emit a debug notification of environment variable VAR_NAME with value
   ENV_VAR (NULL meaning the variable is not set).  */

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}
/* Parse the GOMP_NVPTX_JIT environment variable.  Currently the only
   recognized token is "-O<n>" with 0 <= n <= 4, whose digit is stored
   into *GOMP_NVPTX_O (left untouched when the variable is unset).
   Unrecognized input is reported via GOMP_PLUGIN_error and parsing
   stops.  */

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      /* Tokens are separated by spaces.  */
      while (*c == ' ')
        c++;

      if (c[0] == '-' && c[1] == 'O'
          && '0' <= c[2] && c[2] <= '4'
          && (c[3] == '\0' || c[3] == ' '))
        {
          *gomp_nvptx_o = c[2] - '0';
          c += 3;
          continue;
        }

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
659 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
662 CUjit_option opts
[7];
667 CUlinkState linkstate
;
670 size_t linkoutsize
__attribute__ ((unused
));
672 opts
[0] = CU_JIT_WALL_TIME
;
673 optvals
[0] = &elapsed
;
675 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
676 optvals
[1] = &ilog
[0];
678 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
679 optvals
[2] = (void *) sizeof ilog
;
681 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
682 optvals
[3] = &elog
[0];
684 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
685 optvals
[4] = (void *) sizeof elog
;
687 opts
[5] = CU_JIT_LOG_VERBOSE
;
688 optvals
[5] = (void *) 1;
690 static intptr_t gomp_nvptx_o
= -1;
692 static bool init_done
= false;
695 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
700 if (gomp_nvptx_o
!= -1)
702 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
703 optvals
[nopts
] = (void *) gomp_nvptx_o
;
707 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
708 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
710 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
712 for (; num_objs
--; ptx_objs
++)
714 /* cuLinkAddData's 'data' argument erroneously omits the const
716 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
717 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
718 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
719 (char *) ptx_objs
->code
, ptx_objs
->size
,
722 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
723 (char *) ptx_objs
->code
, ptx_objs
->size
,
725 if (r
!= CUDA_SUCCESS
)
727 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
728 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
734 GOMP_PLUGIN_debug (0, "Linking\n");
735 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
737 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
738 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
740 if (r
!= CUDA_SUCCESS
)
742 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
743 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
747 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
748 CUDA_CALL (cuLinkDestroy
, linkstate
);
753 nvptx_exec (void (*fn
), unsigned *dims
, void *targ_mem_desc
,
754 CUdeviceptr dp
, CUstream stream
)
756 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
760 struct nvptx_thread
*nvthd
= nvptx_thread ();
761 int warp_size
= nvthd
->ptx_dev
->warp_size
;
763 function
= targ_fn
->fn
;
765 /* Initialize the launch dimensions. Typically this is constant,
766 provided by the device compiler, but we must permit runtime
769 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
771 if (targ_fn
->launch
->dim
[i
])
772 dims
[i
] = targ_fn
->launch
->dim
[i
];
779 pthread_mutex_lock (&ptx_dev_lock
);
781 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
782 if (!gomp_openacc_dims
[0])
784 /* See if the user provided GOMP_OPENACC_DIM environment
785 variable to specify runtime defaults. */
786 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
787 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
790 if (!nvthd
->ptx_dev
->default_dims
[0])
792 int default_dims
[GOMP_DIM_MAX
];
793 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
794 default_dims
[i
] = gomp_openacc_dims
[i
];
796 int gang
, worker
, vector
;
798 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
799 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
800 int dev_size
= nvthd
->ptx_dev
->num_sms
;
801 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
802 " dev_size=%d, cpu_size=%d\n",
803 warp_size
, block_size
, dev_size
, cpu_size
);
805 gang
= (cpu_size
/ block_size
) * dev_size
;
806 worker
= block_size
/ warp_size
;
810 /* There is no upper bound on the gang size. The best size
811 matches the hardware configuration. Logical gangs are
812 scheduled onto physical hardware. To maximize usage, we
813 should guess a large number. */
814 if (default_dims
[GOMP_DIM_GANG
] < 1)
815 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
816 /* The worker size must not exceed the hardware. */
817 if (default_dims
[GOMP_DIM_WORKER
] < 1
818 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
819 default_dims
[GOMP_DIM_WORKER
] = worker
;
820 /* The vector size must exactly match the hardware. */
821 if (default_dims
[GOMP_DIM_VECTOR
] < 1
822 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
823 default_dims
[GOMP_DIM_VECTOR
] = vector
;
825 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
826 default_dims
[GOMP_DIM_GANG
],
827 default_dims
[GOMP_DIM_WORKER
],
828 default_dims
[GOMP_DIM_VECTOR
]);
830 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
831 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
833 pthread_mutex_unlock (&ptx_dev_lock
);
836 bool default_dim_p
[GOMP_DIM_MAX
];
837 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
838 default_dim_p
[i
] = !dims
[i
];
840 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
842 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
843 if (default_dim_p
[i
])
844 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
846 if (default_dim_p
[GOMP_DIM_VECTOR
])
847 dims
[GOMP_DIM_VECTOR
]
848 = MIN (dims
[GOMP_DIM_VECTOR
],
849 (targ_fn
->max_threads_per_block
/ warp_size
852 if (default_dim_p
[GOMP_DIM_WORKER
])
853 dims
[GOMP_DIM_WORKER
]
854 = MIN (dims
[GOMP_DIM_WORKER
],
855 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
859 /* Handle the case that the compiler allows the runtime to choose
860 the vector-length conservatively, by ignoring
861 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
864 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
865 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
866 exceed targ_fn->max_threads_per_block. */
867 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
868 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
871 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
872 &blocks
, function
, NULL
, 0,
873 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
874 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
875 "grid = %d, block = %d\n", grids
, blocks
);
877 /* Keep the num_gangs proportional to the block size. In
878 the case were a block size is limited by shared-memory
879 or the register file capacity, the runtime will not
880 excessively over assign gangs to the multiprocessor
881 units if their state is going to be swapped out even
882 more than necessary. The constant factor 2 is there to
883 prevent threads from idling when there is insufficient
886 gangs
= 2 * grids
* (blocks
/ warp_size
);
893 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
895 : dims
[GOMP_DIM_VECTOR
]);
896 workers
= blocks
/ actual_vectors
;
897 workers
= MAX (workers
, 1);
898 /* If we need a per-worker barrier ... . */
899 if (actual_vectors
> 32)
900 /* Don't use more barriers than available. */
901 workers
= MIN (workers
, 15);
904 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
905 if (default_dim_p
[i
])
908 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
909 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
910 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
911 default: GOMP_PLUGIN_fatal ("invalid dim");
917 /* Check if the accelerator has sufficient hardware resources to
918 launch the offloaded kernel. */
919 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
920 > targ_fn
->max_threads_per_block
)
923 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
924 " with num_workers = %d and vector_length = %d"
926 "recompile the program with 'num_workers = x and vector_length = y'"
927 " on that offloaded region or '-fopenacc-dim=:x:y' where"
930 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
931 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
934 /* Check if the accelerator has sufficient barrier resources to
935 launch the offloaded kernel. */
936 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
939 = ("The Nvidia accelerator has insufficient barrier resources to launch"
940 " '%s' with num_workers = %d and vector_length = %d"
942 "recompile the program with 'num_workers = x' on that offloaded"
943 " region or '-fopenacc-dim=:x:' where x <= 15"
945 "or, recompile the program with 'vector_length = 32' on that"
946 " offloaded region or '-fopenacc-dim=::32'"
948 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
949 dims
[GOMP_DIM_VECTOR
]);
952 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
953 " gangs=%u, workers=%u, vectors=%u\n",
954 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
955 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
959 // num_gangs nctaid.x
960 // num_workers ntid.y
961 // vector length ntid.x
963 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
964 acc_prof_info
*prof_info
= thr
->prof_info
;
965 acc_event_info enqueue_launch_event_info
;
966 acc_api_info
*api_info
= thr
->api_info
;
967 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
970 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
972 enqueue_launch_event_info
.launch_event
.event_type
973 = prof_info
->event_type
;
974 enqueue_launch_event_info
.launch_event
.valid_bytes
975 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
976 enqueue_launch_event_info
.launch_event
.parent_construct
977 = acc_construct_parallel
;
978 enqueue_launch_event_info
.launch_event
.implicit
= 1;
979 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
980 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
981 enqueue_launch_event_info
.launch_event
.num_gangs
982 = dims
[GOMP_DIM_GANG
];
983 enqueue_launch_event_info
.launch_event
.num_workers
984 = dims
[GOMP_DIM_WORKER
];
985 enqueue_launch_event_info
.launch_event
.vector_length
986 = dims
[GOMP_DIM_VECTOR
];
988 api_info
->device_api
= acc_device_api_cuda
;
990 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
995 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
996 dims
[GOMP_DIM_GANG
], 1, 1,
997 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
998 0, stream
, kargs
, 0);
1002 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
1003 enqueue_launch_event_info
.launch_event
.event_type
1004 = prof_info
->event_type
;
1005 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
1009 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1010 targ_fn
->launch
->fn
);
1013 void * openacc_get_current_cuda_context (void);
1016 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
1018 acc_prof_info
*prof_info
= thr
->prof_info
;
1019 acc_event_info data_event_info
;
1020 acc_api_info
*api_info
= thr
->api_info
;
1022 prof_info
->event_type
= acc_ev_alloc
;
1024 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1025 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1026 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1027 data_event_info
.data_event
.implicit
= 1;
1028 data_event_info
.data_event
.tool_info
= NULL
;
1029 data_event_info
.data_event
.var_name
= NULL
;
1030 data_event_info
.data_event
.bytes
= s
;
1031 data_event_info
.data_event
.host_ptr
= NULL
;
1032 data_event_info
.data_event
.device_ptr
= dp
;
1034 api_info
->device_api
= acc_device_api_cuda
;
1036 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1039 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1040 size threshold, or if FORCE is true. */
1043 nvptx_stacks_free (struct ptx_device
*ptx_dev
, bool force
)
1045 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1046 if (ptx_dev
->omp_stacks
.ptr
1047 && (force
|| ptx_dev
->omp_stacks
.size
> SOFTSTACK_CACHE_LIMIT
))
1049 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
1050 if (r
!= CUDA_SUCCESS
)
1051 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1052 ptx_dev
->omp_stacks
.ptr
= 0;
1053 ptx_dev
->omp_stacks
.size
= 0;
1055 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1059 nvptx_alloc (size_t s
, bool suppress_errors
)
1063 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &d
, s
);
1064 if (suppress_errors
&& r
== CUDA_ERROR_OUT_OF_MEMORY
)
1066 else if (r
!= CUDA_SUCCESS
)
1068 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r
));
1072 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1073 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1075 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1077 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1083 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1085 acc_prof_info
*prof_info
= thr
->prof_info
;
1086 acc_event_info data_event_info
;
1087 acc_api_info
*api_info
= thr
->api_info
;
1089 prof_info
->event_type
= acc_ev_free
;
1091 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1092 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1093 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1094 data_event_info
.data_event
.implicit
= 1;
1095 data_event_info
.data_event
.tool_info
= NULL
;
1096 data_event_info
.data_event
.var_name
= NULL
;
1097 data_event_info
.data_event
.bytes
= -1;
1098 data_event_info
.data_event
.host_ptr
= NULL
;
1099 data_event_info
.data_event
.device_ptr
= p
;
1101 api_info
->device_api
= acc_device_api_cuda
;
1103 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1107 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1112 CUresult r
= CUDA_CALL_NOCHECK (cuMemGetAddressRange
, &pb
, &ps
,
1114 if (r
== CUDA_ERROR_NOT_PERMITTED
)
1116 /* We assume that this error indicates we are in a CUDA callback context,
1117 where all CUDA calls are not allowed (see cuStreamAddCallback
1118 documentation for description). Arrange to free this piece of device
1120 struct ptx_free_block
*n
1121 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1123 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1124 n
->next
= ptx_dev
->free_blocks
;
1125 ptx_dev
->free_blocks
= n
;
1126 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1129 else if (r
!= CUDA_SUCCESS
)
1131 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r
));
1134 if ((CUdeviceptr
) p
!= pb
)
1136 GOMP_PLUGIN_error ("invalid device address");
1140 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1141 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1143 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1145 goacc_profiling_acc_ev_free (thr
, p
);
1151 nvptx_get_current_cuda_device (void)
1153 struct nvptx_thread
*nvthd
= nvptx_thread ();
1155 if (!nvthd
|| !nvthd
->ptx_dev
)
1158 return &nvthd
->ptx_dev
->dev
;
1162 nvptx_get_current_cuda_context (void)
1164 struct nvptx_thread
*nvthd
= nvptx_thread ();
1166 if (!nvthd
|| !nvthd
->ptx_dev
)
1169 return nvthd
->ptx_dev
->ctx
;
1172 /* Plugin entry points. */
/* Plugin entry point: return the offload target name.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1181 GOMP_OFFLOAD_get_caps (void)
1183 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1187 GOMP_OFFLOAD_get_type (void)
1189 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1193 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
1195 int num_devices
= nvptx_get_num_devices ();
1196 /* Return -1 if no omp_requires_mask cannot be fulfilled but
1197 devices were present. Unified-shared address: see comment in
1198 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1200 && ((omp_requires_mask
1201 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1202 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
1208 GOMP_OFFLOAD_init_device (int n
)
1210 struct ptx_device
*dev
;
1212 pthread_mutex_lock (&ptx_dev_lock
);
1214 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1216 pthread_mutex_unlock (&ptx_dev_lock
);
1220 dev
= nvptx_open_device (n
);
1223 ptx_devices
[n
] = dev
;
1224 instantiated_devices
++;
1227 const char *var_name
= "GOMP_NVPTX_LOWLAT_POOL";
1228 const char *env_var
= secure_getenv (var_name
);
1229 notify_var (var_name
, env_var
);
1231 if (env_var
!= NULL
)
1234 unsigned long val
= strtoul (env_var
, &endptr
, 10);
1235 if (endptr
== NULL
|| *endptr
!= '\0'
1236 || errno
== ERANGE
|| errno
== EINVAL
1238 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
1240 lowlat_pool_size
= val
;
1243 pthread_mutex_unlock (&ptx_dev_lock
);
1249 GOMP_OFFLOAD_fini_device (int n
)
1251 pthread_mutex_lock (&ptx_dev_lock
);
1253 if (ptx_devices
[n
] != NULL
)
1255 if (!nvptx_attach_host_thread_to_device (n
)
1256 || !nvptx_close_device (ptx_devices
[n
]))
1258 pthread_mutex_unlock (&ptx_dev_lock
);
1261 ptx_devices
[n
] = NULL
;
1262 instantiated_devices
--;
1265 if (instantiated_devices
== 0)
1271 pthread_mutex_unlock (&ptx_dev_lock
);
1275 /* Return the libgomp version number we're compatible with. There is
1276 no requirement for cross-version compatibility. */
1279 GOMP_OFFLOAD_version (void)
1281 return GOMP_VERSION
;
1284 /* Initialize __nvptx_clocktick, if present in MODULE. */
1287 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
/* Look the symbol up; absence is not an error (NOT_FOUND returns early).  */
1290 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1291 module
, "__nvptx_clocktick");
1292 if (r
== CUDA_ERROR_NOT_FOUND
)
1294 if (r
!= CUDA_SUCCESS
)
1295 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
/* Seconds per clock tick: clock_khz is in kHz, so 1e-3/khz = 1/Hz.  */
1296 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1297 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1298 sizeof (__nvptx_clocktick
));
1299 if (r
!= CUDA_SUCCESS
)
1300 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1303 /* Load the (partial) program described by TARGET_DATA to device
1304 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1305 will contain the on-device addresses of the functions for reverse offload.
1306 To be freed by the caller. */
1309 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1310 struct addr_pair
**target_table
,
1311 uint64_t **rev_fn_table
,
1312 uint64_t *host_ind_fn_table
)
1315 const char *const *var_names
;
1316 const struct targ_fn_launch
*fn_descs
;
1317 unsigned int fn_entries
, var_entries
, ind_fn_entries
, other_entries
, i
, j
;
1318 struct targ_fn_descriptor
*targ_fns
;
1319 struct addr_pair
*targ_tbl
;
1320 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1321 struct ptx_image_data
*new_image
;
1322 struct ptx_device
*dev
;
1324 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1326 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1327 " (expected %u, received %u)",
1328 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1332 if (!nvptx_attach_host_thread_to_device (ord
)
1333 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1336 dev
= ptx_devices
[ord
];
1338 /* The mkoffload utility emits a struct of pointers/integers at the
1339 start of each offload image. The array of kernel names and the
1340 functions addresses form a one-to-one correspondence. */
1342 var_entries
= img_header
->var_num
;
1343 var_names
= img_header
->var_names
;
1344 fn_entries
= img_header
->fn_num
;
1345 fn_descs
= img_header
->fn_descs
;
1346 ind_fn_entries
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
1347 ? img_header
->ind_fn_num
: 0;
1349 /* Currently, other_entries contains only the struct of ICVs. */
1352 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1353 * (fn_entries
+ var_entries
+ other_entries
));
1354 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1357 *target_table
= targ_tbl
;
1359 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1360 new_image
->target_data
= target_data
;
1361 new_image
->module
= module
;
1362 new_image
->fns
= targ_fns
;
1364 pthread_mutex_lock (&dev
->image_lock
);
1365 new_image
->next
= dev
->images
;
1366 dev
->images
= new_image
;
1367 pthread_mutex_unlock (&dev
->image_lock
);
1369 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1371 CUfunction function
;
1374 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1376 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1377 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1378 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1379 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1381 targ_fns
->fn
= function
;
1382 targ_fns
->launch
= &fn_descs
[i
];
1383 targ_fns
->regs_per_thread
= nregs
;
1384 targ_fns
->max_threads_per_block
= mthrs
;
1386 targ_tbl
->start
= (uintptr_t) targ_fns
;
1387 targ_tbl
->end
= targ_tbl
->start
+ 1;
1390 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1395 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1396 &var
, &bytes
, module
, var_names
[j
]);
1398 targ_tbl
->start
= (uintptr_t) var
;
1399 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1402 if (ind_fn_entries
> 0)
1407 /* Read indirect function table from image. */
1408 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1409 "$offload_ind_func_table");
1410 if (r
!= CUDA_SUCCESS
)
1411 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1412 assert (bytes
== sizeof (uint64_t) * ind_fn_entries
);
1414 uint64_t ind_fn_table
[ind_fn_entries
];
1415 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, ind_fn_table
, var
, bytes
);
1416 if (r
!= CUDA_SUCCESS
)
1417 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1419 /* Build host->target address map for indirect functions. */
1420 uint64_t ind_fn_map
[ind_fn_entries
* 2 + 1];
1421 for (unsigned k
= 0; k
< ind_fn_entries
; k
++)
1423 ind_fn_map
[k
* 2] = host_ind_fn_table
[k
];
1424 ind_fn_map
[k
* 2 + 1] = ind_fn_table
[k
];
1425 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1426 k
, host_ind_fn_table
[k
], ind_fn_table
[k
]);
1428 ind_fn_map
[ind_fn_entries
* 2] = 0;
1430 /* Write the map onto the target. */
1431 void *map_target_addr
1432 = GOMP_OFFLOAD_alloc (ord
, sizeof (ind_fn_map
));
1433 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr
);
1435 GOMP_OFFLOAD_host2dev (ord
, map_target_addr
,
1437 sizeof (ind_fn_map
));
1439 /* Write address of the map onto the target. */
1442 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1443 module
, XSTRING (GOMP_INDIRECT_ADDR_MAP
));
1444 if (r
!= CUDA_SUCCESS
)
1445 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1448 GOMP_PLUGIN_debug (0,
1449 "Indirect map variable found at %llx with size %ld\n",
1452 GOMP_OFFLOAD_host2dev (ord
, (void *) varptr
, &map_target_addr
,
1453 sizeof (map_target_addr
));
1458 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1459 module
, XSTRING (GOMP_ADDITIONAL_ICVS
));
1461 if (r
== CUDA_SUCCESS
)
1463 targ_tbl
->start
= (uintptr_t) varptr
;
1464 targ_tbl
->end
= (uintptr_t) (varptr
+ varsize
);
1467 /* The variable was not in this image. */
1468 targ_tbl
->start
= targ_tbl
->end
= 0;
1470 if (rev_fn_table
&& fn_entries
== 0)
1471 *rev_fn_table
= NULL
;
1472 else if (rev_fn_table
)
1477 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1478 "$offload_func_table");
1479 if (r
!= CUDA_SUCCESS
)
1480 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1481 assert (bytes
== sizeof (uint64_t) * fn_entries
);
1482 *rev_fn_table
= GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries
);
1483 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, *rev_fn_table
, var
, bytes
);
1484 if (r
!= CUDA_SUCCESS
)
1485 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1486 /* Free if only NULL entries. */
1487 for (i
= 0; i
< fn_entries
; ++i
)
1488 if ((*rev_fn_table
)[i
] != 0)
1490 if (i
== fn_entries
)
1492 free (*rev_fn_table
);
1493 *rev_fn_table
= NULL
;
1497 if (rev_fn_table
&& *rev_fn_table
&& dev
->rev_data
== NULL
)
1499 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1500 available but it might be not. One reason could be: if the user code
1501 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1502 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1503 are not linked in. */
1504 CUdeviceptr device_rev_offload_var
;
1505 size_t device_rev_offload_size
;
1506 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
,
1507 &device_rev_offload_var
,
1508 &device_rev_offload_size
, module
,
1509 XSTRING (GOMP_REV_OFFLOAD_VAR
));
1510 if (r
!= CUDA_SUCCESS
)
1512 free (*rev_fn_table
);
1513 *rev_fn_table
= NULL
;
1517 /* cuMemHostAlloc memory is accessible on the device, if
1518 unified-shared address is supported; this is assumed - see comment
1519 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1520 CUDA_CALL_ASSERT (cuMemHostAlloc
, (void **) &dev
->rev_data
,
1521 sizeof (*dev
->rev_data
), CU_MEMHOSTALLOC_DEVICEMAP
);
1522 CUdeviceptr dp
= (CUdeviceptr
) dev
->rev_data
;
1523 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, device_rev_offload_var
, &dp
,
1525 if (r
!= CUDA_SUCCESS
)
1526 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1530 nvptx_set_clocktick (module
, dev
);
1532 return fn_entries
+ var_entries
+ other_entries
;
1535 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1536 function descriptors allocated by G_O_load_image. */
1539 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1541 struct ptx_image_data
*image
, **prev_p
;
1542 struct ptx_device
*dev
= ptx_devices
[ord
];
1544 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1546 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1547 " (expected %u, received %u)",
1548 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1553 pthread_mutex_lock (&dev
->image_lock
);
1554 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1555 if (image
->target_data
== target_data
)
1557 *prev_p
= image
->next
;
1558 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1564 pthread_mutex_unlock (&dev
->image_lock
);
1569 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1571 if (!nvptx_attach_host_thread_to_device (ord
))
1574 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1575 struct ptx_free_block
*blocks
, *tmp
;
1577 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1578 blocks
= ptx_dev
->free_blocks
;
1579 ptx_dev
->free_blocks
= NULL
;
1580 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1582 nvptx_stacks_free (ptx_dev
, false);
1587 nvptx_free (blocks
->ptr
, ptx_dev
);
1592 void *d
= nvptx_alloc (size
, true);
1597 /* Memory allocation failed. Try freeing the stacks block, and
1599 nvptx_stacks_free (ptx_dev
, true);
1600 return nvptx_alloc (size
, false);
/* Free device memory PTR on device ORD; true on success.  */
1605 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1607 return (nvptx_attach_host_thread_to_device (ord
)
1608 && nvptx_free (ptr
, ptx_devices
[ord
]));
1612 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *),
1613 size_t mapnum
__attribute__((unused
)),
1614 void **hostaddrs
__attribute__((unused
)),
1616 unsigned *dims
, void *targ_mem_desc
)
1618 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1620 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1621 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, NULL
);
1623 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1624 const char *maybe_abort_msg
= "(perhaps abort was called)";
1625 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1626 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1628 else if (r
!= CUDA_SUCCESS
)
1629 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1633 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *),
1634 size_t mapnum
__attribute__((unused
)),
1635 void **hostaddrs
__attribute__((unused
)),
1637 unsigned *dims
, void *targ_mem_desc
,
1638 struct goacc_asyncqueue
*aq
)
1640 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1642 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1643 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, aq
->cuda_stream
);
1647 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1649 struct ptx_device
*ptx_dev
;
1650 struct nvptx_thread
*nvthd
1651 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1654 ptx_dev
= ptx_devices
[ord
];
1658 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1660 assert (ptx_dev
->ctx
);
1663 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1665 nvthd
->ptx_dev
= ptx_dev
;
1667 return (void *) nvthd
;
1671 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)
1677 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1679 return nvptx_get_current_cuda_device ();
1683 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1685 return nvptx_get_current_cuda_context ();
1688 /* This returns a CUstream. */
1690 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
/* Expose the queue's underlying driver stream to OpenACC user code.  */
1692 return (void *) aq
->cuda_stream
;
1695 /* This takes a CUstream. */
1697 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
/* Drain and destroy the queue's existing stream before replacing it, so
   no in-flight work is orphaned.  */
1699 if (aq
->cuda_stream
)
1701 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1702 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1705 aq
->cuda_stream
= (CUstream
) stream
;
/* Allocate a goacc_asyncqueue wrapping a fresh CUDA stream created with
   FLAGS; NULL on stream-creation failure (via CUDA_CALL_ERET).  */
1709 static struct goacc_asyncqueue
*
1710 nvptx_goacc_asyncqueue_construct (unsigned int flags
)
1712 CUstream stream
= NULL
;
1713 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, flags
);
1715 struct goacc_asyncqueue
*aq
1716 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1717 aq
->cuda_stream
= stream
;
/* Plugin entry point: build an async queue with default stream flags.
   DEVICE is unused — the current context determines the device.  */
1721 struct goacc_asyncqueue
*
1722 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1724 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT
);
/* Destroy AQ's stream; false on driver error (via CUDA_CALL_ERET).
   The queue structure itself is freed in lines elided here.  */
1728 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue
*aq
)
1730 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
/* Plugin entry point: thin wrapper over the static destructor.  */
1736 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
1738 return nvptx_goacc_asyncqueue_destruct (aq
);
/* Non-blocking completion test for AQ.  Distinguishes "done" (SUCCESS)
   from "still running" (NOT_READY) from real errors; the return
   statements for the first two cases are elided in this extraction.  */
1742 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1744 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1745 if (r
== CUDA_SUCCESS
)
1747 if (r
== CUDA_ERROR_NOT_READY
)
1750 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
/* Block until all work queued on AQ's stream has finished; false on
   driver error (via CUDA_CALL_ERET).  */
1755 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue
*aq
)
1757 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
/* Plugin entry point: thin wrapper over the static synchronize helper.  */
1762 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
1764 return nvptx_goacc_asyncqueue_synchronize (aq
);
/* Order AQ2 after AQ1: record an event on AQ1's stream and make AQ2's
   stream wait on it.  The event is timing-disabled since only ordering
   is needed.  False on any driver error (via CUDA_CALL_ERET).  */
1768 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1769 struct goacc_asyncqueue
*aq2
)
1772 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1773 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1774 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1779 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1781 if (res
!= CUDA_SUCCESS
)
1782 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1783 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
1789 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
1790 void (*callback_fn
)(void *),
1793 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
1794 b
->fn
= callback_fn
;
1797 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
1798 cuda_callback_wrapper
, (void *) b
, 0);
1802 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
1810 GOMP_PLUGIN_error ("invalid device address");
1813 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1816 GOMP_PLUGIN_error ("invalid device address");
1821 GOMP_PLUGIN_error ("invalid host address");
1826 GOMP_PLUGIN_error ("invalid host or device address");
1829 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1831 GOMP_PLUGIN_error ("invalid size");
/* Synchronous host-to-device copy of N bytes on device ORD, after
   validating the host/device pointer pair.  */
1838 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1840 if (!nvptx_attach_host_thread_to_device (ord
)
/* Note argument order: sanity check takes (host, device, size).  */
1841 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1843 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
/* Synchronous device-to-host copy of N bytes on device ORD, after
   validating the host/device pointer pair.  */
1848 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1850 if (!nvptx_attach_host_thread_to_device (ord
)
/* Note argument order: sanity check takes (host, device, size).  */
1851 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1853 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
/* Device-to-device copy of N bytes on device ORD, issued asynchronously
   on the NULL (default) stream.  */
1858 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1860 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
1865 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
1866 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
1867 size_t dst_offset0_len
, size_t dst_dim1_size
,
1868 const void *src
, size_t src_offset1_size
,
1869 size_t src_offset0_len
, size_t src_dim1_size
)
1871 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
1874 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1878 memset (&data
, 0, sizeof (data
));
1879 data
.WidthInBytes
= dim1_size
;
1880 data
.Height
= dim0_len
;
1884 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
1889 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
1890 data
.dstDevice
= (CUdeviceptr
) dst
;
1892 data
.dstPitch
= dst_dim1_size
;
1893 data
.dstXInBytes
= dst_offset1_size
;
1894 data
.dstY
= dst_offset0_len
;
1898 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
1903 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
1904 data
.srcDevice
= (CUdeviceptr
) src
;
1906 data
.srcPitch
= src_dim1_size
;
1907 data
.srcXInBytes
= src_offset1_size
;
1908 data
.srcY
= src_offset0_len
;
1910 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0)
1912 /* Adjust origin to the actual array data, else the CUDA 2D memory
1913 copy API calls below may fail to validate source/dest pointers
1914 correctly (especially for Fortran where the "virtual origin" of an
1915 array is often outside the stored data). */
1917 data
.srcHost
= (const void *) ((const char *) data
.srcHost
1918 + data
.srcY
* data
.srcPitch
1919 + data
.srcXInBytes
);
1921 data
.srcDevice
+= data
.srcY
* data
.srcPitch
+ data
.srcXInBytes
;
1922 data
.srcXInBytes
= 0;
1926 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0)
1930 data
.dstHost
= (void *) ((char *) data
.dstHost
1931 + data
.dstY
* data
.dstPitch
1932 + data
.dstXInBytes
);
1934 data
.dstDevice
+= data
.dstY
* data
.dstPitch
+ data
.dstXInBytes
;
1935 data
.dstXInBytes
= 0;
1939 CUresult res
= CUDA_CALL_NOCHECK (cuMemcpy2D
, &data
);
1940 if (res
== CUDA_ERROR_INVALID_VALUE
)
1941 /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
1942 for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
1943 with an error; try the slower cuMemcpy2DUnaligned now. */
1944 CUDA_CALL (cuMemcpy2DUnaligned
, &data
);
1945 else if (res
!= CUDA_SUCCESS
)
1947 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res
));
1954 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
1955 size_t dim1_len
, size_t dim0_len
, void *dst
,
1956 size_t dst_offset2_size
, size_t dst_offset1_len
,
1957 size_t dst_offset0_len
, size_t dst_dim2_size
,
1958 size_t dst_dim1_len
, const void *src
,
1959 size_t src_offset2_size
, size_t src_offset1_len
,
1960 size_t src_offset0_len
, size_t src_dim2_size
,
1961 size_t src_dim1_len
)
1963 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
1966 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1970 memset (&data
, 0, sizeof (data
));
1971 data
.WidthInBytes
= dim2_size
;
1972 data
.Height
= dim1_len
;
1973 data
.Depth
= dim0_len
;
1977 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
1982 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
1983 data
.dstDevice
= (CUdeviceptr
) dst
;
1985 data
.dstPitch
= dst_dim2_size
;
1986 data
.dstHeight
= dst_dim1_len
;
1987 data
.dstXInBytes
= dst_offset2_size
;
1988 data
.dstY
= dst_offset1_len
;
1989 data
.dstZ
= dst_offset0_len
;
1993 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
1998 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
1999 data
.srcDevice
= (CUdeviceptr
) src
;
2001 data
.srcPitch
= src_dim2_size
;
2002 data
.srcHeight
= src_dim1_len
;
2003 data
.srcXInBytes
= src_offset2_size
;
2004 data
.srcY
= src_offset1_len
;
2005 data
.srcZ
= src_offset0_len
;
2007 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0 || data
.srcZ
!= 0)
2009 /* Adjust origin to the actual array data, else the CUDA 3D memory
2010 copy API call below may fail to validate source/dest pointers
2011 correctly (especially for Fortran where the "virtual origin" of an
2012 array is often outside the stored data). */
2015 = (const void *) ((const char *) data
.srcHost
2016 + (data
.srcZ
* data
.srcHeight
+ data
.srcY
)
2018 + data
.srcXInBytes
);
2021 += (data
.srcZ
* data
.srcHeight
+ data
.srcY
) * data
.srcPitch
2023 data
.srcXInBytes
= 0;
2028 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0 || data
.dstZ
!= 0)
2032 data
.dstHost
= (void *) ((char *) data
.dstHost
2033 + (data
.dstZ
* data
.dstHeight
+ data
.dstY
)
2035 + data
.dstXInBytes
);
2038 += (data
.dstZ
* data
.dstHeight
+ data
.dstY
) * data
.dstPitch
2040 data
.dstXInBytes
= 0;
2045 CUDA_CALL (cuMemcpy3D
, &data
);
2050 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
2051 size_t n
, struct goacc_asyncqueue
*aq
)
2053 if (!nvptx_attach_host_thread_to_device (ord
)
2054 || !cuda_memcpy_sanity_check (src
, dst
, n
))
2056 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
2061 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
2062 size_t n
, struct goacc_asyncqueue
*aq
)
2064 if (!nvptx_attach_host_thread_to_device (ord
)
2065 || !cuda_memcpy_sanity_check (dst
, src
, n
))
2067 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
2071 union goacc_property_value
2072 GOMP_OFFLOAD_openacc_get_property (int n
, enum goacc_property prop
)
2074 union goacc_property_value propval
= { .val
= 0 };
2076 pthread_mutex_lock (&ptx_dev_lock
);
2078 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
2080 pthread_mutex_unlock (&ptx_dev_lock
);
2084 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
2087 case GOACC_PROPERTY_MEMORY
:
2091 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
2092 propval
.val
= total_mem
;
2095 case GOACC_PROPERTY_FREE_MEMORY
:
2101 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
2102 if (ptx_dev
->dev
== ctxdev
)
2103 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2104 else if (ptx_dev
->ctx
)
2108 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
2109 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2110 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
2116 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
2118 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2119 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
2121 propval
.val
= free_mem
;
2124 case GOACC_PROPERTY_NAME
:
2125 propval
.ptr
= ptx_dev
->name
;
2127 case GOACC_PROPERTY_VENDOR
:
2128 propval
.ptr
= "Nvidia";
2130 case GOACC_PROPERTY_DRIVER
:
2131 propval
.ptr
= cuda_driver_version_s
;
2137 pthread_mutex_unlock (&ptx_dev_lock
);
2141 /* Adjust launch dimensions: pick good values for number of blocks and warps
2142 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2146 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2147 struct ptx_device
*ptx_dev
,
2148 int *teams_p
, int *threads_p
)
/* THREADS here counts warps (32 threads each), not individual threads.  */
2150 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2151 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2152 and libgcc, which matches documented limit of all GPUs as of 2015. */
2153 if (max_warps_block
> 32)
2154 max_warps_block
= 32;
2155 if (*threads_p
<= 0)
/* Clamp a user-requested warp count to the kernel's capability.  */
2157 if (*threads_p
> max_warps_block
)
2158 *threads_p
= max_warps_block
;
/* Registers consumed by one block: per-thread regs * 32 lanes * warps.  */
2160 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2161 /* This is an estimate of how many blocks the device can host simultaneously.
2162 Actual limit, which may be lower, can be queried with "occupancy control"
2163 driver interface (since CUDA 6.0). */
2164 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2165 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2166 *teams_p
= max_blocks
;
2169 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2173 nvptx_stacks_size ()
2178 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2179 the storage should be held on entry, and remains held on exit. */
2182 nvptx_stacks_acquire (struct ptx_device
*ptx_dev
, size_t size
, int num
)
2184 if (ptx_dev
->omp_stacks
.ptr
&& ptx_dev
->omp_stacks
.size
>= size
* num
)
2185 return (void *) ptx_dev
->omp_stacks
.ptr
;
2187 /* Free the old, too-small stacks. */
2188 if (ptx_dev
->omp_stacks
.ptr
)
2190 CUresult r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2191 if (r
!= CUDA_SUCCESS
)
2192 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r
));
2193 r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
2194 if (r
!= CUDA_SUCCESS
)
2195 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
2198 /* Make new and bigger stacks, and remember where we put them and how big
2200 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &ptx_dev
->omp_stacks
.ptr
,
2202 if (r
!= CUDA_SUCCESS
)
2203 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2205 ptx_dev
->omp_stacks
.size
= size
* num
;
2207 return (void *) ptx_dev
->omp_stacks
.ptr
;
2212 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2214 struct targ_fn_descriptor
*tgt_fn_desc
2215 = (struct targ_fn_descriptor
*) tgt_fn
;
2216 CUfunction function
= tgt_fn_desc
->fn
;
2217 const struct targ_fn_launch
*launch
= tgt_fn_desc
->launch
;
2218 const char *fn_name
= launch
->fn
;
2220 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2221 const char *maybe_abort_msg
= "(perhaps abort was called)";
2222 int teams
= 0, threads
= 0;
2225 GOMP_PLUGIN_fatal ("No target arguments provided");
2228 intptr_t id
= (intptr_t) *args
++, val
;
2229 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2230 val
= (intptr_t) *args
++;
2232 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2233 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2235 val
= val
> INT_MAX
? INT_MAX
: val
;
2236 id
&= GOMP_TARGET_ARG_ID_MASK
;
2237 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2239 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2242 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2244 bool reverse_offload
= ptx_dev
->rev_data
!= NULL
;
2245 struct goacc_asyncqueue
*reverse_offload_aq
= NULL
;
2246 if (reverse_offload
)
2249 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING
);
2250 if (!reverse_offload_aq
)
2251 exit (EXIT_FAILURE
);
2254 size_t stack_size
= nvptx_stacks_size ();
2256 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
2257 void *stacks
= nvptx_stacks_acquire (ptx_dev
, stack_size
, teams
* threads
);
2258 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2259 size_t fn_args_size
= sizeof fn_args
;
2261 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2262 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2265 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2266 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2267 __FUNCTION__
, fn_name
, teams
, threads
);
2268 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2269 32, threads
, 1, lowlat_pool_size
, NULL
, NULL
, config
);
2270 if (r
!= CUDA_SUCCESS
)
2271 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2272 if (reverse_offload
)
2275 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, NULL
);
2276 if (r
== CUDA_SUCCESS
)
2278 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2279 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r
),
2281 else if (r
!= CUDA_ERROR_NOT_READY
)
2282 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
2284 if (__atomic_load_n (&ptx_dev
->rev_data
->fn
, __ATOMIC_ACQUIRE
) != 0)
2286 struct rev_offload
*rev_data
= ptx_dev
->rev_data
;
2287 GOMP_PLUGIN_target_rev (rev_data
->fn
, rev_data
->mapnum
,
2288 rev_data
->addrs
, rev_data
->sizes
,
2289 rev_data
->kinds
, rev_data
->dev_num
,
2290 reverse_offload_aq
);
2291 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq
))
2292 exit (EXIT_FAILURE
);
2293 __atomic_store_n (&rev_data
->fn
, 0, __ATOMIC_RELEASE
);
2298 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2299 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2300 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2302 else if (r
!= CUDA_SUCCESS
)
2303 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2305 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
2307 if (reverse_offload
)
2309 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq
))
2310 exit (EXIT_FAILURE
);
2314 /* TODO: Implement GOMP_OFFLOAD_async_run. */