/* Plugin for NVPTX execution.

   Copyright (C) 2013-2021 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728
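/* Note: 134217728 == 128 * 1024 * 1024, i.e. the 128MB limit mentioned in the
   comment above.  */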
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif
#if CUDA_VERSION >= 6050
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

struct cuda_lib_s {
# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL
} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
#include "secure_getenv.h"

#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)			\
  do {							\
    unsigned __r					\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)				\
      {							\
	GOMP_PLUGIN_error (#FN " error: %s",		\
			   cuda_error (__r));		\
	return ERET;					\
      }							\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)			\
  do {							\
    unsigned __r					\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)				\
      GOMP_PLUGIN_fatal (#FN " error: %s",		\
			 cuda_error (__r));		\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN
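/* Illustrative usage of the macros above: inside a bool-returning function,
   'CUDA_CALL (cuMemAlloc, &d, size);' reports and returns false on failure,
   'CUDA_CALL_ASSERT (cuMemAlloc, &d, size);' aborts via GOMP_PLUGIN_fatal, and
   'CUDA_CALL_NOCHECK (cuMemAlloc, &d, size)' leaves the CUresult to the
   caller.  */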
static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    fallback = desc;

  return fallback;
}
/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};
/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  size_t ptx_num;

  const char *const *var_names;
  size_t var_num;

  const struct targ_fn_launch *fn_descs;
  size_t fn_num;
} nvptx_tdata_t;
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};
/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  bool overlap;
  bool concur;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  /* OpenMP stacks, cached between kernel invocations.  */
  struct
    {
      CUdeviceptr ptr;
      size_t size;
      pthread_mutex_t lock;
    } omp_stacks;

  struct ptx_device *next;
};
static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "%u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  ptx_dev->omp_stacks.ptr = 0;
  ptx_dev->omp_stacks.size = 0;
  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);

  return ptx_dev;
}
static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);

  if (ptx_dev->omp_stacks.ptr)
    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}
static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}
static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}
static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  unsigned nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);
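      /* Illustrative example (hypothetical numbers, not taken from any real
	 device): with max_threads_per_block = 1024, warp_size = 32,
	 max_threads_per_multiprocessor = 2048 and num_sms = 80, the defaults
	 computed above would be gang = (2048 / 1024) * 80 = 160,
	 worker = 1024 / 32 = 32, and vector = warp_size = 32.  */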
      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       this case.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);
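	    /* For instance (purely illustrative numbers): if the occupancy
	       query reported grids = 40 and blocks = 512 with warp_size = 32,
	       the heuristic above picks
	       gangs = 2 * 40 * (512 / 32) = 1280.  */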
	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d.\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void *openacc_get_current_cuda_context (void);
static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.  */

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}
static void *
nvptx_alloc (size_t s, bool suppress_errors)
{
  CUdeviceptr d;

  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}
static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where all CUDA calls are not allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of device
	 memory later.  */
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);
  return true;
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}
bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, the only other entry kind is 'device number'.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  CUdeviceptr device_num_varptr;
  size_t device_num_varsize;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
				  &device_num_varsize, module,
				  STRINGX (GOMP_DEVICE_NUM_VAR));
  if (r == CUDA_SUCCESS)
    {
      targ_tbl->start = (uintptr_t) device_num_varptr;
      targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
    }
  else
    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
    targ_tbl->start = targ_tbl->end = 0;

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries + other_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];
  bool ret = true;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  nvptx_stacks_free (ptx_dev, false);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  void *d = nvptx_alloc (size, true);
  if (d)
    return d;
  else
    {
      /* Memory allocation failed.  Try freeing the stacks block, and
	 retrying.  */
      nvptx_stacks_free (ptx_dev, true);
      return nvptx_alloc (size, false);
    }
}
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      hp = alloca (s);
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
			mapnum * sizeof (void *));
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  CUDA_CALL_ASSERT (cuMemFree, dp);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, (void *) dp);
}
static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}
void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;
  void **block = NULL;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
      hp = block + 2;
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}

      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
			mapnum * sizeof (void *), aq->cuda_stream);
      block[0] = (void *) dp;

      struct nvptx_thread *nvthd =
	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
      block[1] = (void *) nvthd->ptx_dev;

      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, aq->cuda_stream);

  if (mapnum > 0)
    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* This returns a CUstream.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
  return (void *) aq->cuda_stream;
}
/* This takes a CUstream.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
{
  if (aq->cuda_stream)
    {
      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    }

  aq->cuda_stream = (CUstream) stream;
  return 1;
}
struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  return aq;
}
bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
  free (aq);
  return true;
}
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}
static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}
static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}
bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}
bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
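/* Worked example with made-up numbers: for a kernel using
   fn->regs_per_thread = 32 registers with fn->max_threads_per_block = 1024,
   max_warps_block = 1024 / 32 = 32.  With *threads_p = 8 warps,
   regs_per_block = 32 * 32 * 8 = 8192 registers, so on a device with
   regs_per_sm = 65536 and num_sms = 80 the estimate above is
   max_blocks = 65536 / 8192 * 80 = 640 teams.  */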
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   testing.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}
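/* For reference, the cached block allocated above is size * num bytes.  With
   a (hypothetical) per-warp stack of 128 KiB and a launch of
   teams * threads = 640 * 8 = 5120 warps, that is 128 KiB * 5120 = 640 MiB of
   device memory, reused by later launches while it stays large enough and
   below the SOFTSTACK_CACHE_LIMIT policy handled in nvptx_stacks_free.  */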
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */