1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2021 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
43 #include <pthread.h>
44 #include <cuda.h>
45 #include <stdbool.h>
46 #include <limits.h>
47 #include <string.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <assert.h>
51 #include <errno.h>
53 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
54 block to cache between kernel invocations. For soft-stacks blocks bigger
55 than this, we will free the block before attempting another GPU memory
56 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
57 we will free the cached soft-stacks block anyway and then retry the
58 allocation. If that fails too, we lose. */
60 #define SOFTSTACK_CACHE_LIMIT 134217728
62 #if CUDA_VERSION < 6000
63 extern CUresult cuGetErrorString (CUresult, const char **);
64 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
65 #endif
67 #if CUDA_VERSION >= 6050
68 #undef cuLinkCreate
69 #undef cuLinkAddData
70 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
71 const char *, unsigned, CUjit_option *, void **);
72 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
73 #else
74 typedef size_t (*CUoccupancyB2DSize)(int);
75 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
76 const char *, unsigned, CUjit_option *, void **);
77 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
78 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
79 CUoccupancyB2DSize, size_t, int);
80 #endif
82 #define DO_PRAGMA(x) _Pragma (#x)
84 #if PLUGIN_NVPTX_DYNAMIC
85 # include <dlfcn.h>
87 struct cuda_lib_s {
89 # define CUDA_ONE_CALL(call) \
90 __typeof (call) *call;
91 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
92 CUDA_ONE_CALL (call)
93 #include "cuda-lib.def"
94 # undef CUDA_ONE_CALL
95 # undef CUDA_ONE_CALL_MAYBE_NULL
97 } cuda_lib;
99 /* -1 if init_cuda_lib has not been called yet, false
100 if it has been and failed, true if it has been and succeeded. */
101 static signed char cuda_lib_inited = -1;
103 /* Dynamically load the CUDA runtime library and initialize function
104 pointers, return false if unsuccessful, true if successful. */
105 static bool
106 init_cuda_lib (void)
108 if (cuda_lib_inited != -1)
109 return cuda_lib_inited;
110 const char *cuda_runtime_lib = "libcuda.so.1";
111 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
112 cuda_lib_inited = false;
113 if (h == NULL)
114 return false;
116 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
117 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
118 # define CUDA_ONE_CALL_1(call, allow_null) \
119 cuda_lib.call = dlsym (h, #call); \
120 if (!allow_null && cuda_lib.call == NULL) \
121 return false;
122 #include "cuda-lib.def"
123 # undef CUDA_ONE_CALL
124 # undef CUDA_ONE_CALL_1
125 # undef CUDA_ONE_CALL_MAYBE_NULL
127 cuda_lib_inited = true;
128 return true;
130 # define CUDA_CALL_PREFIX cuda_lib.
131 #else
133 # define CUDA_ONE_CALL(call)
134 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
135 #include "cuda-lib.def"
136 #undef CUDA_ONE_CALL_MAYBE_NULL
137 #undef CUDA_ONE_CALL
139 # define CUDA_CALL_PREFIX
140 # define init_cuda_lib() true
141 #endif
143 #include "secure_getenv.h"
145 #undef MIN
146 #undef MAX
147 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
148 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
150 /* Convenience macros for the frequently used CUDA library call and
151 error handling sequence as well as CUDA library calls that
152 do the error checking themselves or don't do it at all. */
154 #define CUDA_CALL_ERET(ERET, FN, ...) \
155 do { \
156 unsigned __r \
157 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
158 if (__r != CUDA_SUCCESS) \
160 GOMP_PLUGIN_error (#FN " error: %s", \
161 cuda_error (__r)); \
162 return ERET; \
164 } while (0)
166 #define CUDA_CALL(FN, ...) \
167 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
169 #define CUDA_CALL_ASSERT(FN, ...) \
170 do { \
171 unsigned __r \
172 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
173 if (__r != CUDA_SUCCESS) \
175 GOMP_PLUGIN_fatal (#FN " error: %s", \
176 cuda_error (__r)); \
178 } while (0)
180 #define CUDA_CALL_NOCHECK(FN, ...) \
181 CUDA_CALL_PREFIX FN (__VA_ARGS__)
183 #define CUDA_CALL_EXISTS(FN) \
184 CUDA_CALL_PREFIX FN
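/* Editor's illustrative sketch, not part of the original plugin: a
   hypothetical helper showing how the macros above combine a CUDA driver
   call with error handling.

     static bool
     example_device_count (int *count)
     {
       CUDA_CALL (cuDeviceGetCount, count);  // on failure: report and return false
       return true;
     }

   CUDA_CALL_ASSERT would instead abort via GOMP_PLUGIN_fatal, while
   CUDA_CALL_NOCHECK returns the raw CUresult for the caller to inspect.  */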
186 static const char *
187 cuda_error (CUresult r)
189 const char *fallback = "unknown cuda error";
190 const char *desc;
192 if (!CUDA_CALL_EXISTS (cuGetErrorString))
193 return fallback;
195 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
196 if (r == CUDA_SUCCESS)
197 return desc;
199 return fallback;
202 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
203 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
204 static char cuda_driver_version_s[30];
206 static unsigned int instantiated_devices = 0;
207 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
209 /* NVPTX/CUDA specific definition of asynchronous queues. */
210 struct goacc_asyncqueue
212 CUstream cuda_stream;
215 struct nvptx_callback
217 void (*fn) (void *);
218 void *ptr;
219 struct goacc_asyncqueue *aq;
220 struct nvptx_callback *next;
223 /* Thread-specific data for PTX. */
225 struct nvptx_thread
227 /* We currently have this embedded inside the plugin because libgomp manages
228 devices through integer target_ids.  It might be better to use an
229 opaque target-specific pointer directly from gomp_device_descr. */
230 struct ptx_device *ptx_dev;
233 /* Target data function launch information. */
235 struct targ_fn_launch
237 const char *fn;
238 unsigned short dim[GOMP_DIM_MAX];
241 /* Target PTX object information. */
243 struct targ_ptx_obj
245 const char *code;
246 size_t size;
249 /* Target data image information. */
251 typedef struct nvptx_tdata
253 const struct targ_ptx_obj *ptx_objs;
254 unsigned ptx_num;
256 const char *const *var_names;
257 unsigned var_num;
259 const struct targ_fn_launch *fn_descs;
260 unsigned fn_num;
261 } nvptx_tdata_t;
263 /* Descriptor of a loaded function. */
265 struct targ_fn_descriptor
267 CUfunction fn;
268 const struct targ_fn_launch *launch;
269 int regs_per_thread;
270 int max_threads_per_block;
273 /* A loaded PTX image. */
274 struct ptx_image_data
276 const void *target_data;
277 CUmodule module;
279 struct targ_fn_descriptor *fns; /* Array of functions. */
281 struct ptx_image_data *next;
284 struct ptx_free_block
286 void *ptr;
287 struct ptx_free_block *next;
290 struct ptx_device
292 CUcontext ctx;
293 bool ctx_shared;
294 CUdevice dev;
296 int ord;
297 bool overlap;
298 bool map;
299 bool concur;
300 bool mkern;
301 int mode;
302 int clock_khz;
303 int num_sms;
304 int regs_per_block;
305 int regs_per_sm;
306 int warp_size;
307 int max_threads_per_block;
308 int max_threads_per_multiprocessor;
309 int default_dims[GOMP_DIM_MAX];
311 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
312 char name[256];
314 struct ptx_image_data *images; /* Images loaded on device. */
315 pthread_mutex_t image_lock; /* Lock for above list. */
317 struct ptx_free_block *free_blocks;
318 pthread_mutex_t free_blocks_lock;
320 /* OpenMP stacks, cached between kernel invocations. */
321 struct
323 CUdeviceptr ptr;
324 size_t size;
325 pthread_mutex_t lock;
326 } omp_stacks;
328 struct ptx_device *next;
331 static struct ptx_device **ptx_devices;
333 static inline struct nvptx_thread *
334 nvptx_thread (void)
336 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
339 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
340 should be locked on entry and remains locked on exit. */
342 static bool
343 nvptx_init (void)
345 int ndevs;
347 if (instantiated_devices != 0)
348 return true;
350 if (!init_cuda_lib ())
351 return false;
353 CUDA_CALL (cuInit, 0);
355 int cuda_driver_version;
356 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
357 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
358 "CUDA Driver %u.%u",
359 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
361 CUDA_CALL (cuDeviceGetCount, &ndevs);
362 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
363 * ndevs);
365 return true;
368 /* Select the N'th PTX device for the current host thread. The device must
369 have been opened before calling this function. */
371 static bool
372 nvptx_attach_host_thread_to_device (int n)
374 CUdevice dev;
375 CUresult r;
376 struct ptx_device *ptx_dev;
377 CUcontext thd_ctx;
379 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
380 if (r == CUDA_ERROR_NOT_PERMITTED)
382 /* Assume we're in a CUDA callback, just return true. */
383 return true;
385 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
387 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
388 return false;
391 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
392 return true;
393 else
395 CUcontext old_ctx;
397 ptx_dev = ptx_devices[n];
398 if (!ptx_dev)
400 GOMP_PLUGIN_error ("device %d not found", n);
401 return false;
404 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
406 /* We don't necessarily have a current context (e.g. if it has been
407 destroyed).  Pop it if we do though. */
408 if (thd_ctx != NULL)
409 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
411 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
413 return true;
416 static struct ptx_device *
417 nvptx_open_device (int n)
419 struct ptx_device *ptx_dev;
420 CUdevice dev, ctx_dev;
421 CUresult r;
422 int async_engines, pi;
424 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
426 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
428 ptx_dev->ord = n;
429 ptx_dev->dev = dev;
430 ptx_dev->ctx_shared = false;
432 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
433 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
435 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
436 return NULL;
439 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
441 /* The current host thread has an active context for a different device.
442 Detach it. */
443 CUcontext old_ctx;
444 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
447 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
449 if (!ptx_dev->ctx)
450 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
451 else
452 ptx_dev->ctx_shared = true;
454 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
455 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
456 ptx_dev->overlap = pi;
458 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
459 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
460 ptx_dev->map = pi;
462 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
464 ptx_dev->concur = pi;
466 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
468 ptx_dev->mode = pi;
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
472 ptx_dev->mkern = pi;
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
476 ptx_dev->clock_khz = pi;
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
480 ptx_dev->num_sms = pi;
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
484 ptx_dev->regs_per_block = pi;
486 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
487 in CUDA 6.0 and newer. */
488 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
489 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
490 dev);
491 /* Fallback: use limit of registers per block, which is usually equal. */
492 if (r == CUDA_ERROR_INVALID_VALUE)
493 pi = ptx_dev->regs_per_block;
494 else if (r != CUDA_SUCCESS)
496 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
497 return NULL;
499 ptx_dev->regs_per_sm = pi;
501 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
502 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
503 if (pi != 32)
505 GOMP_PLUGIN_error ("Only warp size 32 is supported");
506 return NULL;
508 ptx_dev->warp_size = pi;
510 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
511 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
512 ptx_dev->max_threads_per_block = pi;
514 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
515 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
516 ptx_dev->max_threads_per_multiprocessor = pi;
518 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
519 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
520 if (r != CUDA_SUCCESS)
521 async_engines = 1;
523 for (int i = 0; i != GOMP_DIM_MAX; i++)
524 ptx_dev->default_dims[i] = 0;
526 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
527 dev);
529 ptx_dev->images = NULL;
530 pthread_mutex_init (&ptx_dev->image_lock, NULL);
532 ptx_dev->free_blocks = NULL;
533 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
535 ptx_dev->omp_stacks.ptr = 0;
536 ptx_dev->omp_stacks.size = 0;
537 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
539 return ptx_dev;
542 static bool
543 nvptx_close_device (struct ptx_device *ptx_dev)
545 if (!ptx_dev)
546 return true;
548 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
550 struct ptx_free_block *b_next = b->next;
551 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
552 free (b);
553 b = b_next;
556 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
557 pthread_mutex_destroy (&ptx_dev->image_lock);
559 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
561 if (ptx_dev->omp_stacks.ptr)
562 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
564 if (!ptx_dev->ctx_shared)
565 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
567 free (ptx_dev);
568 return true;
571 static int
572 nvptx_get_num_devices (void)
574 int n;
576 /* This function will be called before the plugin has been initialized in
577 order to enumerate available devices, but CUDA API routines can't be used
578 until cuInit has been called. Just call it now (but don't yet do any
579 further initialization). */
580 if (instantiated_devices == 0)
582 if (!init_cuda_lib ())
583 return 0;
584 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
585 /* This is not an error: e.g. we may have CUDA libraries installed but
586 no devices available. */
587 if (r != CUDA_SUCCESS)
589 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
590 cuda_error (r));
591 return 0;
595 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
596 return n;
599 static void
600 notify_var (const char *var_name, const char *env_var)
602 if (env_var == NULL)
603 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
604 else
605 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
608 static void
609 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
611 const char *var_name = "GOMP_NVPTX_JIT";
612 const char *env_var = secure_getenv (var_name);
613 notify_var (var_name, env_var);
615 if (env_var == NULL)
616 return;
618 const char *c = env_var;
619 while (*c != '\0')
621 while (*c == ' ')
622 c++;
624 if (c[0] == '-' && c[1] == 'O'
625 && '0' <= c[2] && c[2] <= '4'
626 && (c[3] == '\0' || c[3] == ' '))
628 *gomp_nvptx_o = c[2] - '0';
629 c += 3;
630 continue;
633 GOMP_PLUGIN_error ("Error parsing %s", var_name);
634 break;
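/* Editor's example, assuming typical usage: with GOMP_NVPTX_JIT=-O2 in the
   environment, the loop above stores 2 in *gomp_nvptx_o, and link_ptx later
   passes that value to the CUDA JIT as CU_JIT_OPTIMIZATION_LEVEL.  Any other
   text stops parsing with "Error parsing GOMP_NVPTX_JIT".  */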
638 static bool
639 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
640 unsigned num_objs)
642 CUjit_option opts[7];
643 void *optvals[7];
644 float elapsed = 0.0;
645 char elog[1024];
646 char ilog[16384];
647 CUlinkState linkstate;
648 CUresult r;
649 void *linkout;
650 size_t linkoutsize __attribute__ ((unused));
652 opts[0] = CU_JIT_WALL_TIME;
653 optvals[0] = &elapsed;
655 opts[1] = CU_JIT_INFO_LOG_BUFFER;
656 optvals[1] = &ilog[0];
658 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
659 optvals[2] = (void *) sizeof ilog;
661 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
662 optvals[3] = &elog[0];
664 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
665 optvals[4] = (void *) sizeof elog;
667 opts[5] = CU_JIT_LOG_VERBOSE;
668 optvals[5] = (void *) 1;
670 static intptr_t gomp_nvptx_o = -1;
672 static bool init_done = false;
673 if (!init_done)
675 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
676 init_done = true;
679 int nopts = 6;
680 if (gomp_nvptx_o != -1)
682 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
683 optvals[nopts] = (void *) gomp_nvptx_o;
684 nopts++;
687 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
688 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
689 else
690 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
692 for (; num_objs--; ptx_objs++)
694 /* cuLinkAddData's 'data' argument erroneously omits the const
695 qualifier. */
696 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
697 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
698 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
699 (char *) ptx_objs->code, ptx_objs->size,
700 0, 0, 0, 0);
701 else
702 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
703 (char *) ptx_objs->code, ptx_objs->size,
704 0, 0, 0, 0);
705 if (r != CUDA_SUCCESS)
707 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
708 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
709 cuda_error (r));
710 return false;
714 GOMP_PLUGIN_debug (0, "Linking\n");
715 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
717 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
718 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
720 if (r != CUDA_SUCCESS)
722 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
723 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
724 return false;
727 CUDA_CALL (cuModuleLoadData, module, linkout);
728 CUDA_CALL (cuLinkDestroy, linkstate);
729 return true;
732 static void
733 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
734 unsigned *dims, void *targ_mem_desc,
735 CUdeviceptr dp, CUstream stream)
737 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
738 CUfunction function;
739 int i;
740 void *kargs[1];
741 struct nvptx_thread *nvthd = nvptx_thread ();
742 int warp_size = nvthd->ptx_dev->warp_size;
744 function = targ_fn->fn;
746 /* Initialize the launch dimensions. Typically this is constant,
747 provided by the device compiler, but we must permit runtime
748 values. */
749 int seen_zero = 0;
750 for (i = 0; i != GOMP_DIM_MAX; i++)
752 if (targ_fn->launch->dim[i])
753 dims[i] = targ_fn->launch->dim[i];
754 if (!dims[i])
755 seen_zero = 1;
758 if (seen_zero)
760 pthread_mutex_lock (&ptx_dev_lock);
762 static int gomp_openacc_dims[GOMP_DIM_MAX];
763 if (!gomp_openacc_dims[0])
765 /* See if the user provided GOMP_OPENACC_DIM environment
766 variable to specify runtime defaults. */
767 for (int i = 0; i < GOMP_DIM_MAX; ++i)
768 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
771 if (!nvthd->ptx_dev->default_dims[0])
773 int default_dims[GOMP_DIM_MAX];
774 for (int i = 0; i < GOMP_DIM_MAX; ++i)
775 default_dims[i] = gomp_openacc_dims[i];
777 int gang, worker, vector;
779 int block_size = nvthd->ptx_dev->max_threads_per_block;
780 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
781 int dev_size = nvthd->ptx_dev->num_sms;
782 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
783 " dev_size=%d, cpu_size=%d\n",
784 warp_size, block_size, dev_size, cpu_size);
786 gang = (cpu_size / block_size) * dev_size;
787 worker = block_size / warp_size;
788 vector = warp_size;
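/* Worked example (editor's illustration, with assumed hardware values):
   for a device with max_threads_per_block = 1024,
   max_threads_per_multiprocessor = 2048, num_sms = 80 and warp_size = 32,
   the computation above yields
     gang   = (2048 / 1024) * 80 = 160
     worker = 1024 / 32          = 32
     vector = 32.  */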
791 /* There is no upper bound on the gang size. The best size
792 matches the hardware configuration. Logical gangs are
793 scheduled onto physical hardware. To maximize usage, we
794 should guess a large number. */
795 if (default_dims[GOMP_DIM_GANG] < 1)
796 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
797 /* The worker size must not exceed the hardware. */
798 if (default_dims[GOMP_DIM_WORKER] < 1
799 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
800 default_dims[GOMP_DIM_WORKER] = worker;
801 /* The vector size must exactly match the hardware. */
802 if (default_dims[GOMP_DIM_VECTOR] < 1
803 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
804 default_dims[GOMP_DIM_VECTOR] = vector;
806 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
807 default_dims[GOMP_DIM_GANG],
808 default_dims[GOMP_DIM_WORKER],
809 default_dims[GOMP_DIM_VECTOR]);
811 for (i = 0; i != GOMP_DIM_MAX; i++)
812 nvthd->ptx_dev->default_dims[i] = default_dims[i];
814 pthread_mutex_unlock (&ptx_dev_lock);
817 bool default_dim_p[GOMP_DIM_MAX];
818 for (i = 0; i != GOMP_DIM_MAX; i++)
819 default_dim_p[i] = !dims[i];
821 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
823 for (i = 0; i != GOMP_DIM_MAX; i++)
824 if (default_dim_p[i])
825 dims[i] = nvthd->ptx_dev->default_dims[i];
827 if (default_dim_p[GOMP_DIM_VECTOR])
828 dims[GOMP_DIM_VECTOR]
829 = MIN (dims[GOMP_DIM_VECTOR],
830 (targ_fn->max_threads_per_block / warp_size
831 * warp_size));
833 if (default_dim_p[GOMP_DIM_WORKER])
834 dims[GOMP_DIM_WORKER]
835 = MIN (dims[GOMP_DIM_WORKER],
836 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
838 else
840 /* Handle the case that the compiler allows the runtime to choose
841 the vector-length conservatively, by ignoring
842 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
843 it. */
844 int vectors = 0;
845 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
846 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
847 exceed targ_fn->max_threads_per_block. */
848 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
849 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
850 int grids, blocks;
852 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
853 &blocks, function, NULL, 0,
854 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
855 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
856 "grid = %d, block = %d\n", grids, blocks);
858 /* Keep the num_gangs proportional to the block size. In
859 the case where a block size is limited by shared memory
860 or the register file capacity, the runtime will not
861 excessively over-assign gangs to the multiprocessor
862 units if their state is going to be swapped out even
863 more than necessary. The constant factor 2 is there to
864 prevent threads from idling when there is insufficient
865 work for them. */
866 if (gangs == 0)
867 gangs = 2 * grids * (blocks / warp_size);
869 if (vectors == 0)
870 vectors = warp_size;
872 if (workers == 0)
874 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
875 ? vectors
876 : dims[GOMP_DIM_VECTOR]);
877 workers = blocks / actual_vectors;
878 workers = MAX (workers, 1);
879 /* If we need a per-worker barrier ... . */
880 if (actual_vectors > 32)
881 /* Don't use more barriers than available. */
882 workers = MIN (workers, 15);
885 for (i = 0; i != GOMP_DIM_MAX; i++)
886 if (default_dim_p[i])
887 switch (i)
889 case GOMP_DIM_GANG: dims[i] = gangs; break;
890 case GOMP_DIM_WORKER: dims[i] = workers; break;
891 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
892 default: GOMP_PLUGIN_fatal ("invalid dim");
898 /* Check if the accelerator has sufficient hardware resources to
899 launch the offloaded kernel. */
900 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
901 > targ_fn->max_threads_per_block)
903 const char *msg
904 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
905 " with num_workers = %d and vector_length = %d"
906 "; "
907 "recompile the program with 'num_workers = x and vector_length = y'"
908 " on that offloaded region or '-fopenacc-dim=:x:y' where"
909 " x * y <= %d"
910 ".\n");
911 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
912 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
915 /* Check if the accelerator has sufficient barrier resources to
916 launch the offloaded kernel. */
917 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
919 const char *msg
920 = ("The Nvidia accelerator has insufficient barrier resources to launch"
921 " '%s' with num_workers = %d and vector_length = %d"
922 "; "
923 "recompile the program with 'num_workers = x' on that offloaded"
924 " region or '-fopenacc-dim=:x:' where x <= 15"
925 "; "
926 "or, recompile the program with 'vector_length = 32' on that"
927 " offloaded region or '-fopenacc-dim=::32'"
928 ".\n");
929 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
930 dims[GOMP_DIM_VECTOR]);
933 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
934 " gangs=%u, workers=%u, vectors=%u\n",
935 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
936 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
938 // OpenACC        CUDA
940 // num_gangs      nctaid.x
941 // num_workers    ntid.y
942 // vector length  ntid.x
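// Editor's note: this corresponds to the cuLaunchKernel call below, which
// passes dims[GOMP_DIM_GANG] as the grid's x dimension and
// dims[GOMP_DIM_VECTOR] / dims[GOMP_DIM_WORKER] as the block's x / y
// dimensions.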
944 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
945 acc_prof_info *prof_info = thr->prof_info;
946 acc_event_info enqueue_launch_event_info;
947 acc_api_info *api_info = thr->api_info;
948 bool profiling_p = __builtin_expect (prof_info != NULL, false);
949 if (profiling_p)
951 prof_info->event_type = acc_ev_enqueue_launch_start;
953 enqueue_launch_event_info.launch_event.event_type
954 = prof_info->event_type;
955 enqueue_launch_event_info.launch_event.valid_bytes
956 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
957 enqueue_launch_event_info.launch_event.parent_construct
958 = acc_construct_parallel;
959 enqueue_launch_event_info.launch_event.implicit = 1;
960 enqueue_launch_event_info.launch_event.tool_info = NULL;
961 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
962 enqueue_launch_event_info.launch_event.num_gangs
963 = dims[GOMP_DIM_GANG];
964 enqueue_launch_event_info.launch_event.num_workers
965 = dims[GOMP_DIM_WORKER];
966 enqueue_launch_event_info.launch_event.vector_length
967 = dims[GOMP_DIM_VECTOR];
969 api_info->device_api = acc_device_api_cuda;
971 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
972 api_info);
975 kargs[0] = &dp;
976 CUDA_CALL_ASSERT (cuLaunchKernel, function,
977 dims[GOMP_DIM_GANG], 1, 1,
978 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
979 0, stream, kargs, 0);
981 if (profiling_p)
983 prof_info->event_type = acc_ev_enqueue_launch_end;
984 enqueue_launch_event_info.launch_event.event_type
985 = prof_info->event_type;
986 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
987 api_info);
990 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
991 targ_fn->launch->fn);
994 void * openacc_get_current_cuda_context (void);
996 static void
997 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
999 acc_prof_info *prof_info = thr->prof_info;
1000 acc_event_info data_event_info;
1001 acc_api_info *api_info = thr->api_info;
1003 prof_info->event_type = acc_ev_alloc;
1005 data_event_info.data_event.event_type = prof_info->event_type;
1006 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1007 data_event_info.data_event.parent_construct = acc_construct_parallel;
1008 data_event_info.data_event.implicit = 1;
1009 data_event_info.data_event.tool_info = NULL;
1010 data_event_info.data_event.var_name = NULL;
1011 data_event_info.data_event.bytes = s;
1012 data_event_info.data_event.host_ptr = NULL;
1013 data_event_info.data_event.device_ptr = dp;
1015 api_info->device_api = acc_device_api_cuda;
1017 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1020 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1021 size threshold, or if FORCE is true. */
1023 static void
1024 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1026 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1027 if (ptx_dev->omp_stacks.ptr
1028 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1030 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1031 if (r != CUDA_SUCCESS)
1032 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1033 ptx_dev->omp_stacks.ptr = 0;
1034 ptx_dev->omp_stacks.size = 0;
1036 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1039 static void *
1040 nvptx_alloc (size_t s, bool suppress_errors)
1042 CUdeviceptr d;
1044 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1045 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1046 return NULL;
1047 else if (r != CUDA_SUCCESS)
1049 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1050 return NULL;
1053 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1054 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1055 bool profiling_p
1056 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1057 if (profiling_p)
1058 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1060 return (void *) d;
1063 static void
1064 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1066 acc_prof_info *prof_info = thr->prof_info;
1067 acc_event_info data_event_info;
1068 acc_api_info *api_info = thr->api_info;
1070 prof_info->event_type = acc_ev_free;
1072 data_event_info.data_event.event_type = prof_info->event_type;
1073 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1074 data_event_info.data_event.parent_construct = acc_construct_parallel;
1075 data_event_info.data_event.implicit = 1;
1076 data_event_info.data_event.tool_info = NULL;
1077 data_event_info.data_event.var_name = NULL;
1078 data_event_info.data_event.bytes = -1;
1079 data_event_info.data_event.host_ptr = NULL;
1080 data_event_info.data_event.device_ptr = p;
1082 api_info->device_api = acc_device_api_cuda;
1084 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1087 static bool
1088 nvptx_free (void *p, struct ptx_device *ptx_dev)
1090 CUdeviceptr pb;
1091 size_t ps;
1093 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1094 (CUdeviceptr) p);
1095 if (r == CUDA_ERROR_NOT_PERMITTED)
1097 /* We assume that this error indicates we are in a CUDA callback context,
1098 where no CUDA calls are allowed (see cuStreamAddCallback
1099 documentation for description). Arrange to free this piece of device
1100 memory later. */
1101 struct ptx_free_block *n
1102 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1103 n->ptr = p;
1104 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1105 n->next = ptx_dev->free_blocks;
1106 ptx_dev->free_blocks = n;
1107 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1108 return true;
1110 else if (r != CUDA_SUCCESS)
1112 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1113 return false;
1115 if ((CUdeviceptr) p != pb)
1117 GOMP_PLUGIN_error ("invalid device address");
1118 return false;
1121 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1122 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1123 bool profiling_p
1124 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1125 if (profiling_p)
1126 goacc_profiling_acc_ev_free (thr, p);
1128 return true;
1131 static void *
1132 nvptx_get_current_cuda_device (void)
1134 struct nvptx_thread *nvthd = nvptx_thread ();
1136 if (!nvthd || !nvthd->ptx_dev)
1137 return NULL;
1139 return &nvthd->ptx_dev->dev;
1142 static void *
1143 nvptx_get_current_cuda_context (void)
1145 struct nvptx_thread *nvthd = nvptx_thread ();
1147 if (!nvthd || !nvthd->ptx_dev)
1148 return NULL;
1150 return nvthd->ptx_dev->ctx;
1153 /* Plugin entry points. */
1155 const char *
1156 GOMP_OFFLOAD_get_name (void)
1158 return "nvptx";
1161 unsigned int
1162 GOMP_OFFLOAD_get_caps (void)
1164 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1168 GOMP_OFFLOAD_get_type (void)
1170 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1174 GOMP_OFFLOAD_get_num_devices (void)
1176 return nvptx_get_num_devices ();
1179 bool
1180 GOMP_OFFLOAD_init_device (int n)
1182 struct ptx_device *dev;
1184 pthread_mutex_lock (&ptx_dev_lock);
1186 if (!nvptx_init () || ptx_devices[n] != NULL)
1188 pthread_mutex_unlock (&ptx_dev_lock);
1189 return false;
1192 dev = nvptx_open_device (n);
1193 if (dev)
1195 ptx_devices[n] = dev;
1196 instantiated_devices++;
1199 pthread_mutex_unlock (&ptx_dev_lock);
1201 return dev != NULL;
1204 bool
1205 GOMP_OFFLOAD_fini_device (int n)
1207 pthread_mutex_lock (&ptx_dev_lock);
1209 if (ptx_devices[n] != NULL)
1211 if (!nvptx_attach_host_thread_to_device (n)
1212 || !nvptx_close_device (ptx_devices[n]))
1214 pthread_mutex_unlock (&ptx_dev_lock);
1215 return false;
1217 ptx_devices[n] = NULL;
1218 instantiated_devices--;
1221 if (instantiated_devices == 0)
1223 free (ptx_devices);
1224 ptx_devices = NULL;
1227 pthread_mutex_unlock (&ptx_dev_lock);
1228 return true;
1231 /* Return the libgomp version number we're compatible with. There is
1232 no requirement for cross-version compatibility. */
1234 unsigned
1235 GOMP_OFFLOAD_version (void)
1237 return GOMP_VERSION;
1240 /* Initialize __nvptx_clocktick, if present in MODULE. */
1242 static void
1243 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1245 CUdeviceptr dptr;
1246 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1247 module, "__nvptx_clocktick");
1248 if (r == CUDA_ERROR_NOT_FOUND)
1249 return;
1250 if (r != CUDA_SUCCESS)
1251 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1252 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1253 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1254 sizeof (__nvptx_clocktick));
1255 if (r != CUDA_SUCCESS)
1256 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1259 /* Load the (partial) program described by TARGET_DATA to device
1260 number ORD. Allocate and return TARGET_TABLE. */
1263 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1264 struct addr_pair **target_table)
1266 CUmodule module;
1267 const char *const *var_names;
1268 const struct targ_fn_launch *fn_descs;
1269 unsigned int fn_entries, var_entries, other_entries, i, j;
1270 struct targ_fn_descriptor *targ_fns;
1271 struct addr_pair *targ_tbl;
1272 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1273 struct ptx_image_data *new_image;
1274 struct ptx_device *dev;
1276 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1278 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1279 " (expected %u, received %u)",
1280 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1281 return -1;
1284 if (!nvptx_attach_host_thread_to_device (ord)
1285 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1286 return -1;
1288 dev = ptx_devices[ord];
1290 /* The mkoffload utility emits a struct of pointers/integers at the
1291 start of each offload image. The array of kernel names and the
1292 function addresses form a one-to-one correspondence. */
1294 var_entries = img_header->var_num;
1295 var_names = img_header->var_names;
1296 fn_entries = img_header->fn_num;
1297 fn_descs = img_header->fn_descs;
1299 /* Currently, the only other entry kind is 'device number'. */
1300 other_entries = 1;
1302 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1303 * (fn_entries + var_entries + other_entries));
1304 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1305 * fn_entries);
1307 *target_table = targ_tbl;
1309 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1310 new_image->target_data = target_data;
1311 new_image->module = module;
1312 new_image->fns = targ_fns;
1314 pthread_mutex_lock (&dev->image_lock);
1315 new_image->next = dev->images;
1316 dev->images = new_image;
1317 pthread_mutex_unlock (&dev->image_lock);
1319 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1321 CUfunction function;
1322 int nregs, mthrs;
1324 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1325 fn_descs[i].fn);
1326 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1327 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1328 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1329 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1331 targ_fns->fn = function;
1332 targ_fns->launch = &fn_descs[i];
1333 targ_fns->regs_per_thread = nregs;
1334 targ_fns->max_threads_per_block = mthrs;
1336 targ_tbl->start = (uintptr_t) targ_fns;
1337 targ_tbl->end = targ_tbl->start + 1;
1340 for (j = 0; j < var_entries; j++, targ_tbl++)
1342 CUdeviceptr var;
1343 size_t bytes;
1345 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1346 &var, &bytes, module, var_names[j]);
1348 targ_tbl->start = (uintptr_t) var;
1349 targ_tbl->end = targ_tbl->start + bytes;
1352 CUdeviceptr device_num_varptr;
1353 size_t device_num_varsize;
1354 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
1355 &device_num_varsize, module,
1356 STRINGX (GOMP_DEVICE_NUM_VAR));
1357 if (r == CUDA_SUCCESS)
1359 targ_tbl->start = (uintptr_t) device_num_varptr;
1360 targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
1362 else
1363 /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image. */
1364 targ_tbl->start = targ_tbl->end = 0;
1365 targ_tbl++;
1367 nvptx_set_clocktick (module, dev);
1369 return fn_entries + var_entries + other_entries;
1372 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1373 function descriptors allocated by G_O_load_image. */
1375 bool
1376 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1378 struct ptx_image_data *image, **prev_p;
1379 struct ptx_device *dev = ptx_devices[ord];
1381 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1383 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1384 " (expected %u, received %u)",
1385 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1386 return false;
1389 bool ret = true;
1390 pthread_mutex_lock (&dev->image_lock);
1391 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1392 if (image->target_data == target_data)
1394 *prev_p = image->next;
1395 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1396 ret = false;
1397 free (image->fns);
1398 free (image);
1399 break;
1401 pthread_mutex_unlock (&dev->image_lock);
1402 return ret;
1405 void *
1406 GOMP_OFFLOAD_alloc (int ord, size_t size)
1408 if (!nvptx_attach_host_thread_to_device (ord))
1409 return NULL;
1411 struct ptx_device *ptx_dev = ptx_devices[ord];
1412 struct ptx_free_block *blocks, *tmp;
1414 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1415 blocks = ptx_dev->free_blocks;
1416 ptx_dev->free_blocks = NULL;
1417 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1419 nvptx_stacks_free (ptx_dev, false);
1421 while (blocks)
1423 tmp = blocks->next;
1424 nvptx_free (blocks->ptr, ptx_dev);
1425 free (blocks);
1426 blocks = tmp;
1429 void *d = nvptx_alloc (size, true);
1430 if (d)
1431 return d;
1432 else
1434 /* Memory allocation failed. Try freeing the stacks block, and
1435 retrying. */
1436 nvptx_stacks_free (ptx_dev, true);
1437 return nvptx_alloc (size, false);
1441 bool
1442 GOMP_OFFLOAD_free (int ord, void *ptr)
1444 return (nvptx_attach_host_thread_to_device (ord)
1445 && nvptx_free (ptr, ptx_devices[ord]));
1448 void
1449 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1450 void **hostaddrs, void **devaddrs,
1451 unsigned *dims, void *targ_mem_desc)
1453 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1455 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1456 acc_prof_info *prof_info = thr->prof_info;
1457 acc_event_info data_event_info;
1458 acc_api_info *api_info = thr->api_info;
1459 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1461 void **hp = NULL;
1462 CUdeviceptr dp = 0;
1464 if (mapnum > 0)
1466 size_t s = mapnum * sizeof (void *);
1467 hp = alloca (s);
1468 for (int i = 0; i < mapnum; i++)
1469 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1470 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1471 if (profiling_p)
1472 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1475 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1476 fact have the same value on a unified-memory system). */
1477 if (mapnum > 0)
1479 if (profiling_p)
1481 prof_info->event_type = acc_ev_enqueue_upload_start;
1483 data_event_info.data_event.event_type = prof_info->event_type;
1484 data_event_info.data_event.valid_bytes
1485 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1486 data_event_info.data_event.parent_construct
1487 = acc_construct_parallel;
1488 data_event_info.data_event.implicit = 1; /* Always implicit. */
1489 data_event_info.data_event.tool_info = NULL;
1490 data_event_info.data_event.var_name = NULL;
1491 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1492 data_event_info.data_event.host_ptr = hp;
1493 data_event_info.data_event.device_ptr = (const void *) dp;
1495 api_info->device_api = acc_device_api_cuda;
1497 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1498 api_info);
1500 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1501 mapnum * sizeof (void *));
1502 if (profiling_p)
1504 prof_info->event_type = acc_ev_enqueue_upload_end;
1505 data_event_info.data_event.event_type = prof_info->event_type;
1506 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1507 api_info);
1511 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1512 dp, NULL);
1514 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1515 const char *maybe_abort_msg = "(perhaps abort was called)";
1516 if (r == CUDA_ERROR_LAUNCH_FAILED)
1517 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1518 maybe_abort_msg);
1519 else if (r != CUDA_SUCCESS)
1520 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1522 CUDA_CALL_ASSERT (cuMemFree, dp);
1523 if (profiling_p)
1524 goacc_profiling_acc_ev_free (thr, (void *) dp);
1527 static void
1528 cuda_free_argmem (void *ptr)
1530 void **block = (void **) ptr;
1531 nvptx_free (block[0], (struct ptx_device *) block[1]);
1532 free (block);
1535 void
1536 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1537 void **hostaddrs, void **devaddrs,
1538 unsigned *dims, void *targ_mem_desc,
1539 struct goacc_asyncqueue *aq)
1541 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1543 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1544 acc_prof_info *prof_info = thr->prof_info;
1545 acc_event_info data_event_info;
1546 acc_api_info *api_info = thr->api_info;
1547 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1549 void **hp = NULL;
1550 CUdeviceptr dp = 0;
1551 void **block = NULL;
1553 if (mapnum > 0)
1555 size_t s = mapnum * sizeof (void *);
1556 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1557 hp = block + 2;
1558 for (int i = 0; i < mapnum; i++)
1559 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1560 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1561 if (profiling_p)
1562 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1565 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1566 fact have the same value on a unified-memory system). */
1567 if (mapnum > 0)
1569 if (profiling_p)
1571 prof_info->event_type = acc_ev_enqueue_upload_start;
1573 data_event_info.data_event.event_type = prof_info->event_type;
1574 data_event_info.data_event.valid_bytes
1575 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1576 data_event_info.data_event.parent_construct
1577 = acc_construct_parallel;
1578 data_event_info.data_event.implicit = 1; /* Always implicit. */
1579 data_event_info.data_event.tool_info = NULL;
1580 data_event_info.data_event.var_name = NULL;
1581 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1582 data_event_info.data_event.host_ptr = hp;
1583 data_event_info.data_event.device_ptr = (const void *) dp;
1585 api_info->device_api = acc_device_api_cuda;
1587 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1588 api_info);
1591 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1592 mapnum * sizeof (void *), aq->cuda_stream);
1593 block[0] = (void *) dp;
1595 struct nvptx_thread *nvthd =
1596 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1597 block[1] = (void *) nvthd->ptx_dev;
1599 if (profiling_p)
1601 prof_info->event_type = acc_ev_enqueue_upload_end;
1602 data_event_info.data_event.event_type = prof_info->event_type;
1603 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1604 api_info);
1608 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1609 dp, aq->cuda_stream);
1611 if (mapnum > 0)
1612 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1615 void *
1616 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1618 struct ptx_device *ptx_dev;
1619 struct nvptx_thread *nvthd
1620 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1621 CUcontext thd_ctx;
1623 ptx_dev = ptx_devices[ord];
1625 assert (ptx_dev);
1627 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1629 assert (ptx_dev->ctx);
1631 if (!thd_ctx)
1632 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1634 nvthd->ptx_dev = ptx_dev;
1636 return (void *) nvthd;
1639 void
1640 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1642 free (data);
1645 void *
1646 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1648 return nvptx_get_current_cuda_device ();
1651 void *
1652 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1654 return nvptx_get_current_cuda_context ();
1657 /* This returns a CUstream. */
1658 void *
1659 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1661 return (void *) aq->cuda_stream;
1664 /* This takes a CUstream. */
1666 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1668 if (aq->cuda_stream)
1670 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1671 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1674 aq->cuda_stream = (CUstream) stream;
1675 return 1;
1678 struct goacc_asyncqueue *
1679 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1681 CUstream stream = NULL;
1682 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1684 struct goacc_asyncqueue *aq
1685 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1686 aq->cuda_stream = stream;
1687 return aq;
1690 bool
1691 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1693 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1694 free (aq);
1695 return true;
1699 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1701 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1702 if (r == CUDA_SUCCESS)
1703 return 1;
1704 if (r == CUDA_ERROR_NOT_READY)
1705 return 0;
1707 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1708 return -1;
1711 bool
1712 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1714 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1715 return true;
1718 bool
1719 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1720 struct goacc_asyncqueue *aq2)
1722 CUevent e;
1723 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1724 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1725 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1726 return true;
1729 static void
1730 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1732 if (res != CUDA_SUCCESS)
1733 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1734 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1735 cb->fn (cb->ptr);
1736 free (ptr);
1739 void
1740 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1741 void (*callback_fn)(void *),
1742 void *userptr)
1744 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1745 b->fn = callback_fn;
1746 b->ptr = userptr;
1747 b->aq = aq;
1748 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1749 cuda_callback_wrapper, (void *) b, 0);
1752 static bool
1753 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1755 CUdeviceptr pb;
1756 size_t ps;
1757 if (!s)
1758 return true;
1759 if (!d)
1761 GOMP_PLUGIN_error ("invalid device address");
1762 return false;
1764 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1765 if (!pb)
1767 GOMP_PLUGIN_error ("invalid device address");
1768 return false;
1770 if (!h)
1772 GOMP_PLUGIN_error ("invalid host address");
1773 return false;
1775 if (d == h)
1777 GOMP_PLUGIN_error ("invalid host or device address");
1778 return false;
1780 if ((void *)(d + s) > (void *)(pb + ps))
1782 GOMP_PLUGIN_error ("invalid size");
1783 return false;
1785 return true;
1788 bool
1789 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1791 if (!nvptx_attach_host_thread_to_device (ord)
1792 || !cuda_memcpy_sanity_check (src, dst, n))
1793 return false;
1794 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1795 return true;
1798 bool
1799 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1801 if (!nvptx_attach_host_thread_to_device (ord)
1802 || !cuda_memcpy_sanity_check (dst, src, n))
1803 return false;
1804 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1805 return true;
1808 bool
1809 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1811 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1812 return true;
1815 bool
1816 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1817 size_t n, struct goacc_asyncqueue *aq)
1819 if (!nvptx_attach_host_thread_to_device (ord)
1820 || !cuda_memcpy_sanity_check (src, dst, n))
1821 return false;
1822 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1823 return true;
1826 bool
1827 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1828 size_t n, struct goacc_asyncqueue *aq)
1830 if (!nvptx_attach_host_thread_to_device (ord)
1831 || !cuda_memcpy_sanity_check (dst, src, n))
1832 return false;
1833 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1834 return true;
1837 union goacc_property_value
1838 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1840 union goacc_property_value propval = { .val = 0 };
1842 pthread_mutex_lock (&ptx_dev_lock);
1844 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1846 pthread_mutex_unlock (&ptx_dev_lock);
1847 return propval;
1850 struct ptx_device *ptx_dev = ptx_devices[n];
1851 switch (prop)
1853 case GOACC_PROPERTY_MEMORY:
1855 size_t total_mem;
1857 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1858 propval.val = total_mem;
1860 break;
1861 case GOACC_PROPERTY_FREE_MEMORY:
1863 size_t total_mem;
1864 size_t free_mem;
1865 CUdevice ctxdev;
1867 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1868 if (ptx_dev->dev == ctxdev)
1869 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1870 else if (ptx_dev->ctx)
1872 CUcontext old_ctx;
1874 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1875 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1876 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1878 else
1880 CUcontext new_ctx;
1882 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1883 ptx_dev->dev);
1884 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1885 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1887 propval.val = free_mem;
1889 break;
1890 case GOACC_PROPERTY_NAME:
1891 propval.ptr = ptx_dev->name;
1892 break;
1893 case GOACC_PROPERTY_VENDOR:
1894 propval.ptr = "Nvidia";
1895 break;
1896 case GOACC_PROPERTY_DRIVER:
1897 propval.ptr = cuda_driver_version_s;
1898 break;
1899 default:
1900 break;
1903 pthread_mutex_unlock (&ptx_dev_lock);
1904 return propval;
1907 /* Adjust launch dimensions: pick good values for number of blocks and warps
1908 and ensure that the number of warps does not exceed CUDA limits as well as GCC's
1909 own limits. */
1911 static void
1912 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1913 struct ptx_device *ptx_dev,
1914 int *teams_p, int *threads_p)
1916 int max_warps_block = fn->max_threads_per_block / 32;
1917 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1918 and libgcc, which matches the documented limit of all GPUs as of 2015. */
1919 if (max_warps_block > 32)
1920 max_warps_block = 32;
1921 if (*threads_p <= 0)
1922 *threads_p = 8;
1923 if (*threads_p > max_warps_block)
1924 *threads_p = max_warps_block;
1926 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1927 /* This is an estimate of how many blocks the device can host simultaneously.
1928 The actual limit, which may be lower, can be queried via the
1929 "occupancy control" driver interface (since CUDA 6.0). */
1930 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1931 if (*teams_p <= 0 || *teams_p > max_blocks)
1932 *teams_p = max_blocks;
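/* Worked example (editor's illustration, with assumed values): for a kernel
   with regs_per_thread = 32 and max_threads_per_block = 1024, on a device
   with regs_per_sm = 65536 and num_sms = 80, the defaults come out as
     max_warps_block = 1024 / 32 = 32 (capped at 32)
     threads         = 8 (default), so regs_per_block = 32 * 32 * 8 = 8192
     max_blocks      = (65536 / 8192) * 80 = 640 teams.  */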
1935 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1936 target regions. */
1938 static size_t
1939 nvptx_stacks_size ()
1941 return 128 * 1024;
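/* Editor's note: GOMP_OFFLOAD_run below allocates one such stack per
   launched warp, i.e. nvptx_stacks_acquire reserves
   teams * threads * 128 KiB of device memory, cached between kernel
   invocations.  */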
1944 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
1945 the storage should be held on entry, and remains held on exit. */
1947 static void *
1948 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
1950 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
1951 return (void *) ptx_dev->omp_stacks.ptr;
1953 /* Free the old, too-small stacks. */
1954 if (ptx_dev->omp_stacks.ptr)
1956 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1957 if (r != CUDA_SUCCESS)
1958 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
1959 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1960 if (r != CUDA_SUCCESS)
1961 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1964 /* Make new and bigger stacks, and remember where we put them and how big
1965 they are. */
1966 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
1967 size * num);
1968 if (r != CUDA_SUCCESS)
1969 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1971 ptx_dev->omp_stacks.size = size * num;
1973 return (void *) ptx_dev->omp_stacks.ptr;
1976 void
1977 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1979 struct targ_fn_descriptor *tgt_fn_desc
1980 = (struct targ_fn_descriptor *) tgt_fn;
1981 CUfunction function = tgt_fn_desc->fn;
1982 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
1983 const char *fn_name = launch->fn;
1984 CUresult r;
1985 struct ptx_device *ptx_dev = ptx_devices[ord];
1986 const char *maybe_abort_msg = "(perhaps abort was called)";
1987 int teams = 0, threads = 0;
1989 if (!args)
1990 GOMP_PLUGIN_fatal ("No target arguments provided");
1991 while (*args)
1993 intptr_t id = (intptr_t) *args++, val;
1994 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1995 val = (intptr_t) *args++;
1996 else
1997 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1998 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1999 continue;
2000 val = val > INT_MAX ? INT_MAX : val;
2001 id &= GOMP_TARGET_ARG_ID_MASK;
2002 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2003 teams = val;
2004 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2005 threads = val;
2007 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2009 size_t stack_size = nvptx_stacks_size ();
2011 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2012 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2013 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2014 size_t fn_args_size = sizeof fn_args;
2015 void *config[] = {
2016 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2017 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2018 CU_LAUNCH_PARAM_END
2020 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2021 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2022 __FUNCTION__, fn_name, teams, threads);
2023 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2024 32, threads, 1, 0, NULL, NULL, config);
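/* Editor's note: the kernel arguments are passed through cuLaunchKernel's
   'extra' parameter (the CU_LAUNCH_PARAM_BUFFER_POINTER / _SIZE / _END
   triplet in 'config' above), which is why the kernelParams argument is
   NULL here.  */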
2025 if (r != CUDA_SUCCESS)
2026 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2028 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2029 if (r == CUDA_ERROR_LAUNCH_FAILED)
2030 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2031 maybe_abort_msg);
2032 else if (r != CUDA_SUCCESS)
2033 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2035 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2038 /* TODO: Implement GOMP_OFFLOAD_async_run. */