libgomp/plugin/plugin-nvptx.c
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31      is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
46 #include <pthread.h>
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
49 #else
50 # include <cuda.h>
51 #endif
52 #include <stdbool.h>
53 #include <limits.h>
54 #include <string.h>
55 #include <stdio.h>
56 #include <unistd.h>
57 #include <assert.h>
58 #include <errno.h>
59 #include <stdlib.h>
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
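/* For illustration (a sketch; the actual checks live in nvptx_stacks_free and
   GOMP_OFFLOAD_alloc below): a cached soft-stacks block is released once

     force || omp_stacks.size > SOFTSTACK_CACHE_LIMIT

   holds, i.e. blocks larger than 128MB are not kept cached, and the cache is
   also flushed when a GPU allocation would otherwise fail.  */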
70 #if CUDA_VERSION < 6000
71 extern CUresult cuGetErrorString (CUresult, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
73 #endif
75 #if CUDA_VERSION >= 6050
76 #undef cuLinkCreate
77 #undef cuLinkAddData
78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81 #else
82 typedef size_t (*CUoccupancyB2DSize)(int);
83 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84 const char *, unsigned, CUjit_option *, void **);
85 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
86 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
87 CUoccupancyB2DSize, size_t, int);
88 #endif
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
93 # include <dlfcn.h>
95 struct cuda_lib_s {
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
100 CUDA_ONE_CALL (call)
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
105 } cuda_lib;
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited = -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112    pointers; return false if unsuccessful, true if successful.  */
113 static bool
114 init_cuda_lib (void)
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
129 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", cuda_runtime_lib, #call);
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited = true;
136 return true;
138 # define CUDA_CALL_PREFIX cuda_lib.
139 #else
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
145 #undef CUDA_ONE_CALL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
149 #endif
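/* For example (illustrative): an entry 'CUDA_ONE_CALL (cuMemAlloc)' in
   cuda-lib.def becomes, in the dlopen configuration above, a struct member
   '__typeof (cuMemAlloc) *cuMemAlloc;' that init_cuda_lib fills in via dlsym;
   call sites then reach it through CUDA_CALL_PREFIX, i.e. 'cuda_lib.cuMemAlloc'.
   In the direct-link configuration, CUDA_CALL_PREFIX is empty and the plain
   libcuda symbol is used instead.  */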
151 #include "secure_getenv.h"
153 static void notify_var (const char *, const char *);
155 #undef MIN
156 #undef MAX
157 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
158 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
160 /* Convenience macros for the frequently used CUDA library call and
161 error handling sequence as well as CUDA library calls that
162 do the error checking themselves or don't do it at all. */
164 #define CUDA_CALL_ERET(ERET, FN, ...) \
165 do { \
166 unsigned __r \
167 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
168 if (__r != CUDA_SUCCESS) \
170 GOMP_PLUGIN_error (#FN " error: %s", \
171 cuda_error (__r)); \
172 return ERET; \
174 } while (0)
176 #define CUDA_CALL(FN, ...) \
177 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
179 #define CUDA_CALL_ASSERT(FN, ...) \
180 do { \
181 unsigned __r \
182 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
183 if (__r != CUDA_SUCCESS) \
185 GOMP_PLUGIN_fatal (#FN " error: %s", \
186 cuda_error (__r)); \
188 } while (0)
190 #define CUDA_CALL_NOCHECK(FN, ...) \
191 CUDA_CALL_PREFIX FN (__VA_ARGS__)
193 #define CUDA_CALL_EXISTS(FN) \
194 CUDA_CALL_PREFIX FN
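/* Illustrative usage of the macros above (all of these patterns appear later
   in this file):

     CUDA_CALL (cuMemcpyHtoD, d, h, n);              error: GOMP_PLUGIN_error, return false
     CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);    error: GOMP_PLUGIN_error, return NULL
     CUDA_CALL_ASSERT (cuStreamSynchronize, NULL);   error: GOMP_PLUGIN_fatal
     r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);   caller inspects R itself  */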
196 static const char *
197 cuda_error (CUresult r)
199 const char *fallback = "unknown cuda error";
200 const char *desc;
202 if (!CUDA_CALL_EXISTS (cuGetErrorString))
203 return fallback;
205 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
206 if (r == CUDA_SUCCESS)
207 return desc;
209 return fallback;
212 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
213 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
214 static char cuda_driver_version_s[30];
216 static unsigned int instantiated_devices = 0;
217 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
219 /* NVPTX/CUDA specific definition of asynchronous queues. */
220 struct goacc_asyncqueue
222 CUstream cuda_stream;
225 struct nvptx_callback
227 void (*fn) (void *);
228 void *ptr;
229 struct goacc_asyncqueue *aq;
230 struct nvptx_callback *next;
233 /* Thread-specific data for PTX. */
235 struct nvptx_thread
237 /* We currently have this embedded inside the plugin because libgomp manages
238 devices through integer target_ids. This might be better if using an
239 opaque target-specific pointer directly from gomp_device_descr. */
240 struct ptx_device *ptx_dev;
243 /* Target data function launch information. */
245 struct targ_fn_launch
247 const char *fn;
248 unsigned short dim[GOMP_DIM_MAX];
251 /* Target PTX object information. */
253 struct targ_ptx_obj
255 const char *code;
256 size_t size;
259 /* Target data image information. */
261 typedef struct nvptx_tdata
263 const struct targ_ptx_obj *ptx_objs;
264 unsigned ptx_num;
266 const char *const *var_names;
267 unsigned var_num;
269 const struct targ_fn_launch *fn_descs;
270 unsigned fn_num;
272 unsigned ind_fn_num;
273 } nvptx_tdata_t;
275 /* Descriptor of a loaded function. */
277 struct targ_fn_descriptor
279 CUfunction fn;
280 const struct targ_fn_launch *launch;
281 int regs_per_thread;
282 int max_threads_per_block;
285 /* A loaded PTX image. */
286 struct ptx_image_data
288 const void *target_data;
289 CUmodule module;
291 struct targ_fn_descriptor *fns; /* Array of functions. */
293 struct ptx_image_data *next;
296 struct ptx_free_block
298 void *ptr;
299 struct ptx_free_block *next;
302 struct ptx_device
304 CUcontext ctx;
305 bool ctx_shared;
306 CUdevice dev;
308 int ord;
309 bool overlap;
310 bool map;
311 bool concur;
312 bool mkern;
313 int mode;
314 int clock_khz;
315 int num_sms;
316 int regs_per_block;
317 int regs_per_sm;
318 int warp_size;
319 int max_threads_per_block;
320 int max_threads_per_multiprocessor;
321 int default_dims[GOMP_DIM_MAX];
323 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
324 char name[256];
326 struct ptx_image_data *images; /* Images loaded on device. */
327 pthread_mutex_t image_lock; /* Lock for above list. */
329 struct ptx_free_block *free_blocks;
330 pthread_mutex_t free_blocks_lock;
332 /* OpenMP stacks, cached between kernel invocations. */
333 struct
335 CUdeviceptr ptr;
336 size_t size;
337 pthread_mutex_t lock;
338 } omp_stacks;
340 struct rev_offload *rev_data;
341 struct ptx_device *next;
344 static struct ptx_device **ptx_devices;
346 /* "Native" GPU thread stack size. */
347 static unsigned native_gpu_thread_stack_size = 0;
349 /* OpenMP kernels reserve a small amount of ".shared" space for use by
350 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
351 default is set here. */
352 static unsigned lowlat_pool_size = 8 * 1024;
354 static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
355 const char *);
356 static size_t nvptx_stacks_size ();
357 static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
359 static inline struct nvptx_thread *
360 nvptx_thread (void)
362 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
365 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
366 should be locked on entry and remains locked on exit. */
368 static bool
369 nvptx_init (void)
371 int ndevs;
373 if (instantiated_devices != 0)
374 return true;
376 if (!init_cuda_lib ())
377 return false;
379 CUDA_CALL (cuInit, 0);
381 int cuda_driver_version;
382 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
383 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
384 "CUDA Driver %u.%u",
385 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
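  /* E.g. (illustrative): a CUDA driver version value of 12040 is reported as
     "CUDA Driver 12.4".  */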
387 CUDA_CALL (cuDeviceGetCount, &ndevs);
388 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
389 * ndevs);
391 return true;
394 /* Select the N'th PTX device for the current host thread. The device must
395 have been previously opened before calling this function. */
397 static bool
398 nvptx_attach_host_thread_to_device (int n)
400 CUdevice dev;
401 CUresult r;
402 struct ptx_device *ptx_dev;
403 CUcontext thd_ctx;
405 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
406 if (r == CUDA_ERROR_NOT_PERMITTED)
408 /* Assume we're in a CUDA callback, just return true. */
409 return true;
411 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
413 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
414 return false;
417 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
418 return true;
419 else
421 CUcontext old_ctx;
423 ptx_dev = ptx_devices[n];
424 if (!ptx_dev)
426 GOMP_PLUGIN_error ("device %d not found", n);
427 return false;
430 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
432 /* We don't necessarily have a current context (e.g. if it has been
433      destroyed).  Pop it if we do though.  */
434 if (thd_ctx != NULL)
435 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
437 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
439 return true;
442 static struct ptx_device *
443 nvptx_open_device (int n)
445 struct ptx_device *ptx_dev;
446 CUdevice dev, ctx_dev;
447 CUresult r;
448 int pi;
450 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
452 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
454 ptx_dev->ord = n;
455 ptx_dev->dev = dev;
456 ptx_dev->ctx_shared = false;
458 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
459 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
461 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
462 return NULL;
465 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
467 /* The current host thread has an active context for a different device.
468 Detach it. */
469 CUcontext old_ctx;
470 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
473 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
475 if (!ptx_dev->ctx)
476 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
477 else
478 ptx_dev->ctx_shared = true;
480 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
481 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
482 ptx_dev->overlap = pi;
484 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
485 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
486 ptx_dev->map = pi;
488 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
489 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
490 ptx_dev->concur = pi;
492 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
493 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
494 ptx_dev->mode = pi;
496 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
497 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
498 ptx_dev->mkern = pi;
500 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
501 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
502 ptx_dev->clock_khz = pi;
504 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
505 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
506 ptx_dev->num_sms = pi;
508 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
509 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
510 ptx_dev->regs_per_block = pi;
512 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
513 in CUDA 6.0 and newer. */
514 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
515 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
516 dev);
517 /* Fallback: use limit of registers per block, which is usually equal. */
518 if (r == CUDA_ERROR_INVALID_VALUE)
519 pi = ptx_dev->regs_per_block;
520 else if (r != CUDA_SUCCESS)
522 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
523 return NULL;
525 ptx_dev->regs_per_sm = pi;
527 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
528 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
529 if (pi != 32)
531 GOMP_PLUGIN_error ("Only warp size 32 is supported");
532 return NULL;
534 ptx_dev->warp_size = pi;
536 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
537 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
538 ptx_dev->max_threads_per_block = pi;
540 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
541 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
542 ptx_dev->max_threads_per_multiprocessor = pi;
544 /* Required below for reverse offload as implemented, but with compute
545    capability >= 2.0 and 64-bit device processes, this should universally be
546 the case; hence, an assert. */
547 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
548 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
549 assert (r == CUDA_SUCCESS && pi);
551 for (int i = 0; i != GOMP_DIM_MAX; i++)
552 ptx_dev->default_dims[i] = 0;
554 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
555 dev);
557 ptx_dev->images = NULL;
558 pthread_mutex_init (&ptx_dev->image_lock, NULL);
560 ptx_dev->free_blocks = NULL;
561 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
563 /* "Native" GPU thread stack size. */
565 /* This is intentionally undocumented, until we work out a proper, common
566 scheme (as much as makes sense) between all offload plugins as well
567 as between nvptx offloading use of "native" stacks for OpenACC vs.
568 OpenMP "soft stacks" vs. OpenMP '-msoft-stack-reserve-local=[...]'.
570 GCN offloading has a 'GCN_STACK_SIZE' environment variable (without
571 'GOMP_' prefix): documented; presumably used for all things OpenACC and
572 OpenMP? Based on GCN command-line option '-mstack-size=[...]' (marked
573 "obsolete"), that one may be set via a GCN 'mkoffload'-synthesized
574 'constructor' function. */
575 const char *var_name = "GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE";
576 const char *env_var = secure_getenv (var_name);
577 notify_var (var_name, env_var);
579 if (env_var != NULL)
581 char *endptr;
582 unsigned long val = strtoul (env_var, &endptr, 10);
583 if (endptr == NULL || *endptr != '\0'
584 || errno == ERANGE || errno == EINVAL
585 || val > UINT_MAX)
586 GOMP_PLUGIN_error ("Error parsing %s", var_name);
587 else
588 native_gpu_thread_stack_size = val;
591 if (native_gpu_thread_stack_size == 0)
592 ; /* Zero means use default. */
593 else
595 GOMP_PLUGIN_debug (0, "Setting \"native\" GPU thread stack size"
596 " ('CU_LIMIT_STACK_SIZE') to %u bytes\n",
597 native_gpu_thread_stack_size);
598 CUDA_CALL (cuCtxSetLimit,
599 CU_LIMIT_STACK_SIZE, (size_t) native_gpu_thread_stack_size);
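      /* For example (illustrative): GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE=131072
	 raises CU_LIMIT_STACK_SIZE to 128 KiB per GPU thread; unset or 0 keeps
	 the CUDA driver's default.  */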
602 /* OpenMP "soft stacks". */
603 ptx_dev->omp_stacks.ptr = 0;
604 ptx_dev->omp_stacks.size = 0;
605 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
607 ptx_dev->rev_data = NULL;
609 return ptx_dev;
612 static bool
613 nvptx_close_device (struct ptx_device *ptx_dev)
615 if (!ptx_dev)
616 return true;
618 bool ret = true;
620 for (struct ptx_image_data *image = ptx_dev->images;
621 image != NULL;
622 image = image->next)
624 if (!nvptx_do_global_cdtors (image->module, ptx_dev,
625 "__do_global_dtors__entry"
626 /* or "__do_global_dtors__entry__mgomp" */))
627 ret = false;
630 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
632 struct ptx_free_block *b_next = b->next;
633 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
634 free (b);
635 b = b_next;
638 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
639 pthread_mutex_destroy (&ptx_dev->image_lock);
641 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
643 if (ptx_dev->omp_stacks.ptr)
644 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
646 if (!ptx_dev->ctx_shared)
647 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
649 free (ptx_dev);
651 return ret;
654 static int
655 nvptx_get_num_devices (void)
657 int n;
659 /* This function will be called before the plugin has been initialized in
660 order to enumerate available devices, but CUDA API routines can't be used
661 until cuInit has been called. Just call it now (but don't yet do any
662 further initialization). */
663 if (instantiated_devices == 0)
665 if (!init_cuda_lib ())
666 return 0;
667 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
668 /* This is not an error: e.g. we may have CUDA libraries installed but
669 no devices available. */
670 if (r == CUDA_ERROR_NO_DEVICE)
672 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
673 cuda_error (r));
674 return 0;
676 else if (r != CUDA_SUCCESS)
677 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
680 CUDA_CALL_ASSERT (cuDeviceGetCount, &n);
681 return n;
684 static void
685 notify_var (const char *var_name, const char *env_var)
687 if (env_var == NULL)
688 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
689 else
690 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
693 static void
694 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
696 const char *var_name = "GOMP_NVPTX_JIT";
697 const char *env_var = secure_getenv (var_name);
698 notify_var (var_name, env_var);
700 if (env_var == NULL)
701 return;
703 const char *c = env_var;
704 while (*c != '\0')
706 while (*c == ' ')
707 c++;
709 if (c[0] == '-' && c[1] == 'O'
710 && '0' <= c[2] && c[2] <= '4'
711 && (c[3] == '\0' || c[3] == ' '))
713 *gomp_nvptx_o = c[2] - '0';
714 c += 3;
715 continue;
718 GOMP_PLUGIN_error ("Error parsing %s", var_name);
719 break;
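  /* For example (illustrative): GOMP_NVPTX_JIT=-O3 selects PTX JIT optimization
     level 3 (passed via CU_JIT_OPTIMIZATION_LEVEL in link_ptx below); anything
     outside '-O0' .. '-O4' is rejected with "Error parsing GOMP_NVPTX_JIT".  */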
723 static bool
724 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
725 unsigned num_objs)
727 CUjit_option opts[7];
728 void *optvals[7];
729 float elapsed = 0.0;
730 char elog[1024];
731 char ilog[16384];
732 CUlinkState linkstate;
733 CUresult r;
734 void *linkout;
735 size_t linkoutsize __attribute__ ((unused));
737 opts[0] = CU_JIT_WALL_TIME;
738 optvals[0] = &elapsed;
740 opts[1] = CU_JIT_INFO_LOG_BUFFER;
741 optvals[1] = &ilog[0];
743 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
744 optvals[2] = (void *) sizeof ilog;
746 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
747 optvals[3] = &elog[0];
749 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
750 optvals[4] = (void *) sizeof elog;
752 opts[5] = CU_JIT_LOG_VERBOSE;
753 optvals[5] = (void *) 1;
755 static intptr_t gomp_nvptx_o = -1;
757 static bool init_done = false;
758 if (!init_done)
760 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
761 init_done = true;
764 int nopts = 6;
765 if (gomp_nvptx_o != -1)
767 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
768 optvals[nopts] = (void *) gomp_nvptx_o;
769 nopts++;
772 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
773 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
774 else
775 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
777 for (; num_objs--; ptx_objs++)
779 /* cuLinkAddData's 'data' argument erroneously omits the const
780 qualifier. */
781 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
782 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
783 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
784 (char *) ptx_objs->code, ptx_objs->size,
785 0, 0, 0, 0);
786 else
787 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
788 (char *) ptx_objs->code, ptx_objs->size,
789 0, 0, 0, 0);
790 if (r != CUDA_SUCCESS)
792 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
793 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
794 cuda_error (r));
795 return false;
799 GOMP_PLUGIN_debug (0, "Linking\n");
800 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
802 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
803 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
805 if (r != CUDA_SUCCESS)
807 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
808 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
809 return false;
812 CUDA_CALL (cuModuleLoadData, module, linkout);
813 CUDA_CALL (cuLinkDestroy, linkstate);
814 return true;
817 static void
818 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
819 CUdeviceptr dp, CUstream stream)
821 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
822 CUfunction function;
823 int i;
824 void *kargs[1];
825 struct nvptx_thread *nvthd = nvptx_thread ();
826 int warp_size = nvthd->ptx_dev->warp_size;
828 function = targ_fn->fn;
830 /* Initialize the launch dimensions. Typically this is constant,
831 provided by the device compiler, but we must permit runtime
832 values. */
833 int seen_zero = 0;
834 for (i = 0; i != GOMP_DIM_MAX; i++)
836 if (targ_fn->launch->dim[i])
837 dims[i] = targ_fn->launch->dim[i];
838 if (!dims[i])
839 seen_zero = 1;
842 if (seen_zero)
844 pthread_mutex_lock (&ptx_dev_lock);
846 static int gomp_openacc_dims[GOMP_DIM_MAX];
847 if (!gomp_openacc_dims[0])
849 /* See if the user provided GOMP_OPENACC_DIM environment
850 variable to specify runtime defaults. */
851 for (int i = 0; i < GOMP_DIM_MAX; ++i)
852 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
855 if (!nvthd->ptx_dev->default_dims[0])
857 int default_dims[GOMP_DIM_MAX];
858 for (int i = 0; i < GOMP_DIM_MAX; ++i)
859 default_dims[i] = gomp_openacc_dims[i];
861 int gang, worker, vector;
863 int block_size = nvthd->ptx_dev->max_threads_per_block;
864 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
865 int dev_size = nvthd->ptx_dev->num_sms;
866 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
867 " dev_size=%d, cpu_size=%d\n",
868 warp_size, block_size, dev_size, cpu_size);
870 gang = (cpu_size / block_size) * dev_size;
871 worker = block_size / warp_size;
872 vector = warp_size;
875 /* There is no upper bound on the gang size. The best size
876 matches the hardware configuration. Logical gangs are
877 scheduled onto physical hardware. To maximize usage, we
878 should guess a large number. */
879 if (default_dims[GOMP_DIM_GANG] < 1)
880 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
881 /* The worker size must not exceed the hardware. */
882 if (default_dims[GOMP_DIM_WORKER] < 1
883 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
884 default_dims[GOMP_DIM_WORKER] = worker;
885 /* The vector size must exactly match the hardware. */
886 if (default_dims[GOMP_DIM_VECTOR] < 1
887 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
888 default_dims[GOMP_DIM_VECTOR] = vector;
890 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
891 default_dims[GOMP_DIM_GANG],
892 default_dims[GOMP_DIM_WORKER],
893 default_dims[GOMP_DIM_VECTOR]);
895 for (i = 0; i != GOMP_DIM_MAX; i++)
896 nvthd->ptx_dev->default_dims[i] = default_dims[i];
898 pthread_mutex_unlock (&ptx_dev_lock);
901 bool default_dim_p[GOMP_DIM_MAX];
902 for (i = 0; i != GOMP_DIM_MAX; i++)
903 default_dim_p[i] = !dims[i];
905 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
907 for (i = 0; i != GOMP_DIM_MAX; i++)
908 if (default_dim_p[i])
909 dims[i] = nvthd->ptx_dev->default_dims[i];
911 if (default_dim_p[GOMP_DIM_VECTOR])
912 dims[GOMP_DIM_VECTOR]
913 = MIN (dims[GOMP_DIM_VECTOR],
914 (targ_fn->max_threads_per_block / warp_size
915 * warp_size));
917 if (default_dim_p[GOMP_DIM_WORKER])
918 dims[GOMP_DIM_WORKER]
919 = MIN (dims[GOMP_DIM_WORKER],
920 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
922 else
924 /* Handle the case that the compiler allows the runtime to choose
925 the vector-length conservatively, by ignoring
926 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
927 it. */
928 int vectors = 0;
929      /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
930 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
931 exceed targ_fn->max_threads_per_block. */
932 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
933 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
934 int grids, blocks;
936 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
937 &blocks, function, NULL, 0,
938 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
939 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
940 "grid = %d, block = %d\n", grids, blocks);
942 /* Keep the num_gangs proportional to the block size. In
943         the case where a block size is limited by shared-memory
944 or the register file capacity, the runtime will not
945         excessively over-assign gangs to the multiprocessor
946 units if their state is going to be swapped out even
947 more than necessary. The constant factor 2 is there to
948 prevent threads from idling when there is insufficient
949 work for them. */
950 if (gangs == 0)
951 gangs = 2 * grids * (blocks / warp_size);
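      /* E.g. (illustrative): grids == 40 and blocks == 512 with warp_size == 32
	 gives gangs = 2 * 40 * (512 / 32) = 1280.  */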
953 if (vectors == 0)
954 vectors = warp_size;
956 if (workers == 0)
958 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
959 ? vectors
960 : dims[GOMP_DIM_VECTOR]);
961 workers = blocks / actual_vectors;
962 workers = MAX (workers, 1);
963 /* If we need a per-worker barrier ... . */
964 if (actual_vectors > 32)
965 /* Don't use more barriers than available. */
966 workers = MIN (workers, 15);
969 for (i = 0; i != GOMP_DIM_MAX; i++)
970 if (default_dim_p[i])
971 switch (i)
973 case GOMP_DIM_GANG: dims[i] = gangs; break;
974 case GOMP_DIM_WORKER: dims[i] = workers; break;
975 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
976 default: GOMP_PLUGIN_fatal ("invalid dim");
982 /* Check if the accelerator has sufficient hardware resources to
983 launch the offloaded kernel. */
984 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
985 > targ_fn->max_threads_per_block)
987 const char *msg
988 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
989 " with num_workers = %d and vector_length = %d"
990 "; "
991 "recompile the program with 'num_workers = x and vector_length = y'"
992 " on that offloaded region or '-fopenacc-dim=:x:y' where"
993 " x * y <= %d"
994 ".\n");
995 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
996 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
999 /* Check if the accelerator has sufficient barrier resources to
1000 launch the offloaded kernel. */
1001 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
1003 const char *msg
1004 = ("The Nvidia accelerator has insufficient barrier resources to launch"
1005 " '%s' with num_workers = %d and vector_length = %d"
1006 "; "
1007 "recompile the program with 'num_workers = x' on that offloaded"
1008 " region or '-fopenacc-dim=:x:' where x <= 15"
1009 "; "
1010 "or, recompile the program with 'vector_length = 32' on that"
1011 " offloaded region or '-fopenacc-dim=::32'"
1012 ".\n");
1013 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1014 dims[GOMP_DIM_VECTOR]);
1017 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1018 " gangs=%u, workers=%u, vectors=%u\n",
1019 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1020 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1022 // OpenACC CUDA
1024 // num_gangs nctaid.x
1025 // num_workers ntid.y
1026 // vector length ntid.x
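  // E.g. (illustrative): gangs=1024, workers=8, vectors=32 ends up as
  // cuLaunchKernel (fn, 1024, 1, 1, 32, 8, 1, ...) below, i.e. gridDim.x is
  // num_gangs, blockDim.x the vector length and blockDim.y num_workers.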
1028 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1029 acc_prof_info *prof_info = thr->prof_info;
1030 acc_event_info enqueue_launch_event_info;
1031 acc_api_info *api_info = thr->api_info;
1032 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1033 if (profiling_p)
1035 prof_info->event_type = acc_ev_enqueue_launch_start;
1037 enqueue_launch_event_info.launch_event.event_type
1038 = prof_info->event_type;
1039 enqueue_launch_event_info.launch_event.valid_bytes
1040 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
1041 enqueue_launch_event_info.launch_event.parent_construct
1042 = acc_construct_parallel;
1043 enqueue_launch_event_info.launch_event.implicit = 1;
1044 enqueue_launch_event_info.launch_event.tool_info = NULL;
1045 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
1046 enqueue_launch_event_info.launch_event.num_gangs
1047 = dims[GOMP_DIM_GANG];
1048 enqueue_launch_event_info.launch_event.num_workers
1049 = dims[GOMP_DIM_WORKER];
1050 enqueue_launch_event_info.launch_event.vector_length
1051 = dims[GOMP_DIM_VECTOR];
1053 api_info->device_api = acc_device_api_cuda;
1055 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1056 api_info);
1059 kargs[0] = &dp;
1060 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1061 dims[GOMP_DIM_GANG], 1, 1,
1062 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1063 0, stream, kargs, 0);
1065 if (profiling_p)
1067 prof_info->event_type = acc_ev_enqueue_launch_end;
1068 enqueue_launch_event_info.launch_event.event_type
1069 = prof_info->event_type;
1070 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1071 api_info);
1074 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1075 targ_fn->launch->fn);
1078 void * openacc_get_current_cuda_context (void);
1080 static void
1081 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1083 acc_prof_info *prof_info = thr->prof_info;
1084 acc_event_info data_event_info;
1085 acc_api_info *api_info = thr->api_info;
1087 prof_info->event_type = acc_ev_alloc;
1089 data_event_info.data_event.event_type = prof_info->event_type;
1090 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1091 data_event_info.data_event.parent_construct = acc_construct_parallel;
1092 data_event_info.data_event.implicit = 1;
1093 data_event_info.data_event.tool_info = NULL;
1094 data_event_info.data_event.var_name = NULL;
1095 data_event_info.data_event.bytes = s;
1096 data_event_info.data_event.host_ptr = NULL;
1097 data_event_info.data_event.device_ptr = dp;
1099 api_info->device_api = acc_device_api_cuda;
1101 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1104 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1105 size threshold, or if FORCE is true. */
1107 static void
1108 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1110 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1111 if (ptx_dev->omp_stacks.ptr
1112 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1114 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1115 if (r != CUDA_SUCCESS)
1116 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1117 ptx_dev->omp_stacks.ptr = 0;
1118 ptx_dev->omp_stacks.size = 0;
1120 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1123 static void *
1124 nvptx_alloc (size_t s, bool suppress_errors)
1126 CUdeviceptr d;
1128 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1129 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1130 return NULL;
1131 else if (r != CUDA_SUCCESS)
1133 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1134 return NULL;
1137 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1138 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1139 bool profiling_p
1140 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1141 if (profiling_p)
1142 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1144 return (void *) d;
1147 static void
1148 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1150 acc_prof_info *prof_info = thr->prof_info;
1151 acc_event_info data_event_info;
1152 acc_api_info *api_info = thr->api_info;
1154 prof_info->event_type = acc_ev_free;
1156 data_event_info.data_event.event_type = prof_info->event_type;
1157 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1158 data_event_info.data_event.parent_construct = acc_construct_parallel;
1159 data_event_info.data_event.implicit = 1;
1160 data_event_info.data_event.tool_info = NULL;
1161 data_event_info.data_event.var_name = NULL;
1162 data_event_info.data_event.bytes = -1;
1163 data_event_info.data_event.host_ptr = NULL;
1164 data_event_info.data_event.device_ptr = p;
1166 api_info->device_api = acc_device_api_cuda;
1168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1171 static bool
1172 nvptx_free (void *p, struct ptx_device *ptx_dev)
1174 CUdeviceptr pb;
1175 size_t ps;
1177 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1178 (CUdeviceptr) p);
1179 if (r == CUDA_ERROR_NOT_PERMITTED)
1181 /* We assume that this error indicates we are in a CUDA callback context,
1182         where no CUDA calls are allowed (see cuStreamAddCallback
1183 documentation for description). Arrange to free this piece of device
1184 memory later. */
1185 struct ptx_free_block *n
1186 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1187 n->ptr = p;
1188 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1189 n->next = ptx_dev->free_blocks;
1190 ptx_dev->free_blocks = n;
1191 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1192 return true;
1194 else if (r != CUDA_SUCCESS)
1196 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1197 return false;
1199 if ((CUdeviceptr) p != pb)
1201 GOMP_PLUGIN_error ("invalid device address");
1202 return false;
1205 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1206 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1207 bool profiling_p
1208 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1209 if (profiling_p)
1210 goacc_profiling_acc_ev_free (thr, p);
1212 return true;
1215 static void *
1216 nvptx_get_current_cuda_device (void)
1218 struct nvptx_thread *nvthd = nvptx_thread ();
1220 if (!nvthd || !nvthd->ptx_dev)
1221 return NULL;
1223 return &nvthd->ptx_dev->dev;
1226 static void *
1227 nvptx_get_current_cuda_context (void)
1229 struct nvptx_thread *nvthd = nvptx_thread ();
1231 if (!nvthd || !nvthd->ptx_dev)
1232 return NULL;
1234 return nvthd->ptx_dev->ctx;
1237 /* Plugin entry points. */
1239 const char *
1240 GOMP_OFFLOAD_get_name (void)
1242 return "nvptx";
1245 /* Return the UID; if not available return NULL.
1246    Returns freshly allocated memory.  */
1248 const char *
1249 GOMP_OFFLOAD_get_uid (int ord)
1251 CUresult r;
1252 CUuuid s;
1253 struct ptx_device *dev = ptx_devices[ord];
1255 if (CUDA_CALL_EXISTS (cuDeviceGetUuid_v2))
1256 r = CUDA_CALL_NOCHECK (cuDeviceGetUuid_v2, &s, dev->dev);
1257 else if (CUDA_CALL_EXISTS (cuDeviceGetUuid))
1258 r = CUDA_CALL_NOCHECK (cuDeviceGetUuid, &s, dev->dev);
1259 else
1260 return NULL;
1261 if (r != CUDA_SUCCESS)
1262     return NULL;
1264 size_t len = strlen ("GPU-12345678-9abc-defg-hijk-lmniopqrstuv");
1265 char *str = (char *) GOMP_PLUGIN_malloc (len + 1);
1266 sprintf (str,
1267 "GPU-%02x" "%02x" "%02x" "%02x"
1268 "-%02x" "%02x"
1269 "-%02x" "%02x"
1270 "-%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x",
1271 (unsigned char) s.bytes[0], (unsigned char) s.bytes[1],
1272 (unsigned char) s.bytes[2], (unsigned char) s.bytes[3],
1273 (unsigned char) s.bytes[4], (unsigned char) s.bytes[5],
1274 (unsigned char) s.bytes[6], (unsigned char) s.bytes[7],
1275 (unsigned char) s.bytes[8], (unsigned char) s.bytes[9],
1276 (unsigned char) s.bytes[10], (unsigned char) s.bytes[11],
1277 (unsigned char) s.bytes[12], (unsigned char) s.bytes[13],
1278 (unsigned char) s.bytes[14], (unsigned char) s.bytes[15]);
1279 return str;
1282 unsigned int
1283 GOMP_OFFLOAD_get_caps (void)
1285 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1289 GOMP_OFFLOAD_get_type (void)
1291 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1295 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1297 int num_devices = nvptx_get_num_devices ();
1298   /* Return -1 if the omp_requires_mask cannot be fulfilled but
1299 devices were present. Unified-shared address: see comment in
1300 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1301 if (num_devices > 0
1302 && ((omp_requires_mask
1303 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1304 | GOMP_REQUIRES_SELF_MAPS
1305 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
1306 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1307 return -1;
1308 /* Check whether host page access (direct or via migration) is supported;
1309      if so, enable USM.  Currently, capabilities are per device type; hence,
1310 check all devices. */
1311 if (num_devices > 0
1312 && (omp_requires_mask
1313 & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY | GOMP_REQUIRES_SELF_MAPS)))
1314 for (int dev = 0; dev < num_devices; dev++)
1316 int pi;
1317 CUresult r;
1318 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
1319 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
1320 if (r != CUDA_SUCCESS || pi == 0)
1321 return -1;
1323 return num_devices;
1326 bool
1327 GOMP_OFFLOAD_init_device (int n)
1329 struct ptx_device *dev;
1331 pthread_mutex_lock (&ptx_dev_lock);
1333 if (!nvptx_init () || ptx_devices[n] != NULL)
1335 pthread_mutex_unlock (&ptx_dev_lock);
1336 return false;
1339 dev = nvptx_open_device (n);
1340 if (dev)
1342 ptx_devices[n] = dev;
1343 instantiated_devices++;
1346 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1347 const char *env_var = secure_getenv (var_name);
1348 notify_var (var_name, env_var);
1350 if (env_var != NULL)
1352 char *endptr;
1353 unsigned long val = strtoul (env_var, &endptr, 10);
1354 if (endptr == NULL || *endptr != '\0'
1355 || errno == ERANGE || errno == EINVAL
1356 || val > UINT_MAX)
1357 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1358 else
1359 lowlat_pool_size = val;
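  /* For example (illustrative): GOMP_NVPTX_LOWLAT_POOL=16384 doubles the
     default 8 KiB of ".shared" memory reserved for low-latency omp_alloc
     allocations.  */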
1362 pthread_mutex_unlock (&ptx_dev_lock);
1364 return dev != NULL;
1367 bool
1368 GOMP_OFFLOAD_fini_device (int n)
1370 pthread_mutex_lock (&ptx_dev_lock);
1372 if (ptx_devices[n] != NULL)
1374 if (!nvptx_attach_host_thread_to_device (n)
1375 || !nvptx_close_device (ptx_devices[n]))
1377 pthread_mutex_unlock (&ptx_dev_lock);
1378 return false;
1380 ptx_devices[n] = NULL;
1381 instantiated_devices--;
1384 if (instantiated_devices == 0)
1386 free (ptx_devices);
1387 ptx_devices = NULL;
1390 pthread_mutex_unlock (&ptx_dev_lock);
1391 return true;
1394 /* Return the libgomp version number we're compatible with. There is
1395 no requirement for cross-version compatibility. */
1397 unsigned
1398 GOMP_OFFLOAD_version (void)
1400 return GOMP_VERSION;
1403 /* Initialize __nvptx_clocktick, if present in MODULE. */
1405 static void
1406 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1408 CUdeviceptr dptr;
1409 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1410 module, "__nvptx_clocktick");
1411 if (r == CUDA_ERROR_NOT_FOUND)
1412 return;
1413 if (r != CUDA_SUCCESS)
1414 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1415 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1416 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1417 sizeof (__nvptx_clocktick));
1418 if (r != CUDA_SUCCESS)
1419 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1422 /* Invoke MODULE's global constructors/destructors. */
1424 static bool
1425 nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
1426 const char *funcname)
1428 bool ret = true;
1429 char *funcname_mgomp = NULL;
1430 CUresult r;
1431 CUfunction funcptr;
1432 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1433 &funcptr, module, funcname);
1434 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1435 funcname, cuda_error (r));
1436 if (r == CUDA_ERROR_NOT_FOUND)
1438 /* Try '[funcname]__mgomp'. */
1440 size_t funcname_len = strlen (funcname);
1441 const char *mgomp_suffix = "__mgomp";
1442 size_t mgomp_suffix_len = strlen (mgomp_suffix);
1443 funcname_mgomp
1444 = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
1445 memcpy (funcname_mgomp, funcname, funcname_len);
1446 memcpy (funcname_mgomp + funcname_len,
1447 mgomp_suffix, mgomp_suffix_len + 1);
1448 funcname = funcname_mgomp;
1450 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1451 &funcptr, module, funcname);
1452 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1453 funcname, cuda_error (r));
1455 if (r == CUDA_ERROR_NOT_FOUND)
1457 else if (r != CUDA_SUCCESS)
1459 GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
1460 funcname, cuda_error (r));
1461 ret = false;
1463 else
1465 /* If necessary, set up soft stack. */
1466 void *nvptx_stacks_0;
1467 void *kargs[1];
1468 if (funcname_mgomp)
1470 size_t stack_size = nvptx_stacks_size ();
1471 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1472 nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
1473 nvptx_stacks_0 += stack_size;
1474 kargs[0] = &nvptx_stacks_0;
1476 r = CUDA_CALL_NOCHECK (cuLaunchKernel,
1477 funcptr,
1478 1, 1, 1, 1, 1, 1,
1479 /* sharedMemBytes */ 0,
1480 /* hStream */ NULL,
1481 /* kernelParams */ funcname_mgomp ? kargs : NULL,
1482 /* extra */ NULL);
1483 if (r != CUDA_SUCCESS)
1485 GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
1486 funcname, cuda_error (r));
1487 ret = false;
1490 r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
1491 NULL);
1492 if (r != CUDA_SUCCESS)
1494 GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
1495 funcname, cuda_error (r));
1496 ret = false;
1499 if (funcname_mgomp)
1500 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1503 if (funcname_mgomp)
1504 free (funcname_mgomp);
1506 return ret;
1509 /* Load the (partial) program described by TARGET_DATA to device
1510 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1511 will contain the on-device addresses of the functions for reverse offload.
1512 To be freed by the caller. */
1515 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1516 struct addr_pair **target_table,
1517 uint64_t **rev_fn_table,
1518 uint64_t *host_ind_fn_table)
1520 CUmodule module;
1521 const char *const *var_names;
1522 const struct targ_fn_launch *fn_descs;
1523 unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
1524 struct targ_fn_descriptor *targ_fns;
1525 struct addr_pair *targ_tbl;
1526 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1527 struct ptx_image_data *new_image;
1528 struct ptx_device *dev;
1530 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1532 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1533 " (expected %u, received %u)",
1534 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1535 return -1;
1538 if (!nvptx_attach_host_thread_to_device (ord)
1539 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1540 return -1;
1542 dev = ptx_devices[ord];
1544 /* The mkoffload utility emits a struct of pointers/integers at the
1545 start of each offload image. The array of kernel names and the
1546      function addresses form a one-to-one correspondence.  */
1548 var_entries = img_header->var_num;
1549 var_names = img_header->var_names;
1550 fn_entries = img_header->fn_num;
1551 fn_descs = img_header->fn_descs;
1552 ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1553 ? img_header->ind_fn_num : 0;
1555 /* Currently, other_entries contains only the struct of ICVs. */
1556 other_entries = 1;
1558 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1559 * (fn_entries + var_entries + other_entries));
1560 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1561 * fn_entries);
1563 *target_table = targ_tbl;
1565 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1566 new_image->target_data = target_data;
1567 new_image->module = module;
1568 new_image->fns = targ_fns;
1570 pthread_mutex_lock (&dev->image_lock);
1571 new_image->next = dev->images;
1572 dev->images = new_image;
1573 pthread_mutex_unlock (&dev->image_lock);
1575 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1577 CUfunction function;
1578 int nregs, mthrs;
1580 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1581 fn_descs[i].fn);
1582 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1583 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1584 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1585 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1587 targ_fns->fn = function;
1588 targ_fns->launch = &fn_descs[i];
1589 targ_fns->regs_per_thread = nregs;
1590 targ_fns->max_threads_per_block = mthrs;
1592 targ_tbl->start = (uintptr_t) targ_fns;
1593 targ_tbl->end = targ_tbl->start + 1;
1596 for (j = 0; j < var_entries; j++, targ_tbl++)
1598 CUdeviceptr var;
1599 size_t bytes;
1601 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1602 &var, &bytes, module, var_names[j]);
1604 targ_tbl->start = (uintptr_t) var;
1605 targ_tbl->end = targ_tbl->start + bytes;
1608 if (ind_fn_entries > 0)
1610 CUdeviceptr var;
1611 size_t bytes;
1613 /* Read indirect function table from image. */
1614 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1615 "$offload_ind_func_table");
1616 if (r != CUDA_SUCCESS)
1617 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1618 assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1620 uint64_t ind_fn_table[ind_fn_entries];
1621 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1622 if (r != CUDA_SUCCESS)
1623 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1625 /* Build host->target address map for indirect functions. */
1626 uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1627 for (unsigned k = 0; k < ind_fn_entries; k++)
1629 ind_fn_map[k * 2] = host_ind_fn_table[k];
1630 ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1631 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1632 k, host_ind_fn_table[k], ind_fn_table[k]);
1634 ind_fn_map[ind_fn_entries * 2] = 0;
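      /* The map just built is a flat, zero-terminated list of
	 (host address, target address) pairs, e.g. (illustrative):
	 { host_fn0, dev_fn0, host_fn1, dev_fn1, 0 }.  */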
1636 /* Write the map onto the target. */
1637 void *map_target_addr
1638 = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1639 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1641 GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1642 (void*) ind_fn_map,
1643 sizeof (ind_fn_map));
1645 /* Write address of the map onto the target. */
1646 CUdeviceptr varptr;
1647 size_t varsize;
1648 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1649 module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1650 if (r != CUDA_SUCCESS)
1651 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1652 cuda_error (r));
1654 GOMP_PLUGIN_debug (0,
1655 "Indirect map variable found at %llx with size %ld\n",
1656 varptr, varsize);
1658 GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1659 sizeof (map_target_addr));
1662 CUdeviceptr varptr;
1663 size_t varsize;
1664 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1665 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1667 if (r == CUDA_SUCCESS)
1669 targ_tbl->start = (uintptr_t) varptr;
1670 targ_tbl->end = (uintptr_t) (varptr + varsize);
1672 else
1673 /* The variable was not in this image. */
1674 targ_tbl->start = targ_tbl->end = 0;
1676 if (rev_fn_table && fn_entries == 0)
1677 *rev_fn_table = NULL;
1678 else if (rev_fn_table)
1680 CUdeviceptr var;
1681 size_t bytes;
1682 unsigned int i;
1683 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1684 "$offload_func_table");
1685 if (r != CUDA_SUCCESS)
1686 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1687 assert (bytes == sizeof (uint64_t) * fn_entries);
1688 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1689 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1690 if (r != CUDA_SUCCESS)
1691 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1692 /* Free if only NULL entries. */
1693 for (i = 0; i < fn_entries; ++i)
1694 if ((*rev_fn_table)[i] != 0)
1695 break;
1696 if (i == fn_entries)
1698 free (*rev_fn_table);
1699 *rev_fn_table = NULL;
1703 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1705 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1706         available but it might not be.  One reason could be: if the user code
1707 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1708 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1709 are not linked in. */
1710 CUdeviceptr device_rev_offload_var;
1711 size_t device_rev_offload_size;
1712 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1713 &device_rev_offload_var,
1714 &device_rev_offload_size, module,
1715 XSTRING (GOMP_REV_OFFLOAD_VAR));
1716 if (r != CUDA_SUCCESS)
1718 free (*rev_fn_table);
1719 *rev_fn_table = NULL;
1721 else
1723 /* cuMemHostAlloc memory is accessible on the device, if
1724 unified-shared address is supported; this is assumed - see comment
1725 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1726 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1727 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1728 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1729 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1730 sizeof (dp));
1731 if (r != CUDA_SUCCESS)
1732 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1736 nvptx_set_clocktick (module, dev);
1738 if (!nvptx_do_global_cdtors (module, dev,
1739 "__do_global_ctors__entry"
1740 /* or "__do_global_ctors__entry__mgomp" */))
1741 return -1;
1743 return fn_entries + var_entries + other_entries;
1746 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1747 function descriptors allocated by G_O_load_image. */
1749 bool
1750 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1752 struct ptx_image_data *image, **prev_p;
1753 struct ptx_device *dev = ptx_devices[ord];
1755 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1757 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1758 " (expected %u, received %u)",
1759 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1760 return false;
1763 bool ret = true;
1764 pthread_mutex_lock (&dev->image_lock);
1765 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1766 if (image->target_data == target_data)
1768 if (!nvptx_do_global_cdtors (image->module, dev,
1769 "__do_global_dtors__entry"
1770 /* or "__do_global_dtors__entry__mgomp" */))
1771 ret = false;
1773 *prev_p = image->next;
1774 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1775 ret = false;
1776 free (image->fns);
1777 free (image);
1778 break;
1780 pthread_mutex_unlock (&dev->image_lock);
1781 return ret;
1784 void *
1785 GOMP_OFFLOAD_alloc (int ord, size_t size)
1787 if (!nvptx_attach_host_thread_to_device (ord))
1788 return NULL;
1790 struct ptx_device *ptx_dev = ptx_devices[ord];
1791 struct ptx_free_block *blocks, *tmp;
1793 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1794 blocks = ptx_dev->free_blocks;
1795 ptx_dev->free_blocks = NULL;
1796 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1798 nvptx_stacks_free (ptx_dev, false);
1800 while (blocks)
1802 tmp = blocks->next;
1803 nvptx_free (blocks->ptr, ptx_dev);
1804 free (blocks);
1805 blocks = tmp;
1808 void *d = nvptx_alloc (size, true);
1809 if (d)
1810 return d;
1811 else
1813 /* Memory allocation failed. Try freeing the stacks block, and
1814 retrying. */
1815 nvptx_stacks_free (ptx_dev, true);
1816 return nvptx_alloc (size, false);
1820 bool
1821 GOMP_OFFLOAD_free (int ord, void *ptr)
1823 return (nvptx_attach_host_thread_to_device (ord)
1824 && nvptx_free (ptr, ptx_devices[ord]));
1827 void
1828 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1829 size_t mapnum __attribute__((unused)),
1830 void **hostaddrs __attribute__((unused)),
1831 void **devaddrs,
1832 unsigned *dims, void *targ_mem_desc)
1834 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1836 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1837 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
1839 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1840 const char *maybe_abort_msg = "(perhaps abort was called)";
1841 if (r == CUDA_ERROR_LAUNCH_FAILED)
1842 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1843 maybe_abort_msg);
1844 else if (r != CUDA_SUCCESS)
1845 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1848 void
1849 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1850 size_t mapnum __attribute__((unused)),
1851 void **hostaddrs __attribute__((unused)),
1852 void **devaddrs,
1853 unsigned *dims, void *targ_mem_desc,
1854 struct goacc_asyncqueue *aq)
1856 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1858 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1859 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
1862 void *
1863 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1865 struct ptx_device *ptx_dev;
1866 struct nvptx_thread *nvthd
1867 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1868 CUcontext thd_ctx;
1870 ptx_dev = ptx_devices[ord];
1872 assert (ptx_dev);
1874 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1876 assert (ptx_dev->ctx);
1878 if (!thd_ctx)
1879 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1881 nvthd->ptx_dev = ptx_dev;
1883 return (void *) nvthd;
1886 void
1887 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1889 free (data);
1892 void *
1893 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1895 return nvptx_get_current_cuda_device ();
1898 void *
1899 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1901 return nvptx_get_current_cuda_context ();
1904 /* This returns a CUstream. */
1905 void *
1906 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1908 return (void *) aq->cuda_stream;
1911 /* This takes a CUstream. */
1913 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1915 if (aq->cuda_stream)
1917 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1918 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1921 aq->cuda_stream = (CUstream) stream;
1922 return 1;
1925 static struct goacc_asyncqueue *
1926 nvptx_goacc_asyncqueue_construct (unsigned int flags)
1928 CUstream stream = NULL;
1929 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1931 struct goacc_asyncqueue *aq
1932 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1933 aq->cuda_stream = stream;
1934 return aq;
1937 struct goacc_asyncqueue *
1938 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1940 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1943 static bool
1944 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1946 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1947 free (aq);
1948 return true;
1951 bool
1952 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1954 return nvptx_goacc_asyncqueue_destruct (aq);
1958 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1960 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1961 if (r == CUDA_SUCCESS)
1962 return 1;
1963 if (r == CUDA_ERROR_NOT_READY)
1964 return 0;
1966 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1967 return -1;
1970 static bool
1971 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1973 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1974 return true;
1977 bool
1978 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1980 return nvptx_goacc_asyncqueue_synchronize (aq);
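/* Order AQ2 after AQ1: record an event on AQ1's stream and make AQ2's
   stream wait for that event.  */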
1983 bool
1984 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1985 struct goacc_asyncqueue *aq2)
1987 CUevent e;
1988 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1989 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1990 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1991 return true;
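/* Trampoline registered with cuStreamAddCallback: report a fatal error if
   the stream failed, otherwise run the user's callback and free the
   wrapper.  */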
1994 static void
1995 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1997 if (res != CUDA_SUCCESS)
1998 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1999 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
2000 cb->fn (cb->ptr);
2001 free (ptr);
2004 void
2005 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
2006 void (*callback_fn)(void *),
2007 void *userptr)
2009 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
2010 b->fn = callback_fn;
2011 b->ptr = userptr;
2012 b->aq = aq;
2013 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
2014 cuda_callback_wrapper, (void *) b, 0);
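/* Basic sanity checks for an S-byte copy between host address H and device
   address D: both addresses must be non-NULL and distinct, D must lie in a
   device allocation known to the driver, and the copy must not run past the
   end of that allocation.  */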
2017 static bool
2018 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
2019 {
2020 CUdeviceptr pb;
2021 size_t ps;
2022 if (!s)
2023 return true;
2024 if (!d)
2025 {
2026 GOMP_PLUGIN_error ("invalid device address");
2027 return false;
2028 }
2029 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
2030 if (!pb)
2031 {
2032 GOMP_PLUGIN_error ("invalid device address");
2033 return false;
2034 }
2035 if (!h)
2036 {
2037 GOMP_PLUGIN_error ("invalid host address");
2038 return false;
2039 }
2040 if (d == h)
2041 {
2042 GOMP_PLUGIN_error ("invalid host or device address");
2043 return false;
2044 }
2045 if ((void *)(d + s) > (void *)(pb + ps))
2046 {
2047 GOMP_PLUGIN_error ("invalid size");
2048 return false;
2049 }
2050 return true;
2051 }
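/* Copy N bytes from host memory SRC to device memory DST on device ORD,
   synchronously, via cuMemcpyHtoD.  */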
2053 bool
2054 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
2056 if (!nvptx_attach_host_thread_to_device (ord)
2057 || !cuda_memcpy_sanity_check (src, dst, n))
2058 return false;
2059 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
2060 return true;
2063 bool
2064 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
2066 if (!nvptx_attach_host_thread_to_device (ord)
2067 || !cuda_memcpy_sanity_check (dst, src, n))
2068 return false;
2069 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
2070 return true;
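/* Copy N bytes between two addresses on device ORD.  The copy is issued
   with cuMemcpyDtoDAsync on the default stream.  */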
2073 bool
2074 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2076 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
2077 return true;
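/* Copy a 2-D rectangle of DIM0_LEN rows, each DIM1_SIZE bytes wide, between
   SRC and DST.  An ordinal of -1 denotes host memory on that side; the
   *_dim1_size arguments give the row pitches of the underlying arrays and
   the *_offset* arguments the starting row/byte offsets.  */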
2080 bool
2081 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
2082 size_t dim0_len, void *dst, size_t dst_offset1_size,
2083 size_t dst_offset0_len, size_t dst_dim1_size,
2084 const void *src, size_t src_offset1_size,
2085 size_t src_offset0_len, size_t src_dim1_size)
2087 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
2088 return false;
2090 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2092 CUDA_MEMCPY2D data;
2094 memset (&data, 0, sizeof (data));
2095 data.WidthInBytes = dim1_size;
2096 data.Height = dim0_len;
2098 if (dst_ord == -1)
2100 data.dstMemoryType = CU_MEMORYTYPE_HOST;
2101 data.dstHost = dst;
2103 else
2105 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
2106 data.dstDevice = (CUdeviceptr) dst;
2108 data.dstPitch = dst_dim1_size;
2109 data.dstXInBytes = dst_offset1_size;
2110 data.dstY = dst_offset0_len;
2112 if (src_ord == -1)
2114 data.srcMemoryType = CU_MEMORYTYPE_HOST;
2115 data.srcHost = src;
2117 else
2119 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
2120 data.srcDevice = (CUdeviceptr) src;
2122 data.srcPitch = src_dim1_size;
2123 data.srcXInBytes = src_offset1_size;
2124 data.srcY = src_offset0_len;
2126 if (data.srcXInBytes != 0 || data.srcY != 0)
2127 {
2128 /* Adjust origin to the actual array data, else the CUDA 2D memory
2129 copy API calls below may fail to validate source/dest pointers
2130 correctly (especially for Fortran where the "virtual origin" of an
2131 array is often outside the stored data). */
2132 if (src_ord == -1)
2133 data.srcHost = (const void *) ((const char *) data.srcHost
2134 + data.srcY * data.srcPitch
2135 + data.srcXInBytes);
2136 else
2137 data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
2138 data.srcXInBytes = 0;
2139 data.srcY = 0;
2140 }
2142 if (data.dstXInBytes != 0 || data.dstY != 0)
2143 {
2144 /* As above. */
2145 if (dst_ord == -1)
2146 data.dstHost = (void *) ((char *) data.dstHost
2147 + data.dstY * data.dstPitch
2148 + data.dstXInBytes);
2149 else
2150 data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
2151 data.dstXInBytes = 0;
2152 data.dstY = 0;
2153 }
2155 CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
2156 if (res == CUDA_ERROR_INVALID_VALUE)
2157 /* If the pitch exceeds CU_DEVICE_ATTRIBUTE_MAX_PITCH, or for device-to-device
2158 copies of (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
2159 with an error; fall back to the slower cuMemcpy2DUnaligned. */
2160 CUDA_CALL (cuMemcpy2DUnaligned, &data);
2161 else if (res != CUDA_SUCCESS)
2163 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
2164 return false;
2166 return true;
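/* As GOMP_OFFLOAD_memcpy2d above, but for a 3-D region, using cuMemcpy3D.  */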
2169 bool
2170 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
2171 size_t dim1_len, size_t dim0_len, void *dst,
2172 size_t dst_offset2_size, size_t dst_offset1_len,
2173 size_t dst_offset0_len, size_t dst_dim2_size,
2174 size_t dst_dim1_len, const void *src,
2175 size_t src_offset2_size, size_t src_offset1_len,
2176 size_t src_offset0_len, size_t src_dim2_size,
2177 size_t src_dim1_len)
2179 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
2180 return false;
2182 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2184 CUDA_MEMCPY3D data;
2186 memset (&data, 0, sizeof (data));
2187 data.WidthInBytes = dim2_size;
2188 data.Height = dim1_len;
2189 data.Depth = dim0_len;
2191 if (dst_ord == -1)
2193 data.dstMemoryType = CU_MEMORYTYPE_HOST;
2194 data.dstHost = dst;
2196 else
2198 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
2199 data.dstDevice = (CUdeviceptr) dst;
2201 data.dstPitch = dst_dim2_size;
2202 data.dstHeight = dst_dim1_len;
2203 data.dstXInBytes = dst_offset2_size;
2204 data.dstY = dst_offset1_len;
2205 data.dstZ = dst_offset0_len;
2207 if (src_ord == -1)
2209 data.srcMemoryType = CU_MEMORYTYPE_HOST;
2210 data.srcHost = src;
2212 else
2214 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
2215 data.srcDevice = (CUdeviceptr) src;
2217 data.srcPitch = src_dim2_size;
2218 data.srcHeight = src_dim1_len;
2219 data.srcXInBytes = src_offset2_size;
2220 data.srcY = src_offset1_len;
2221 data.srcZ = src_offset0_len;
2223 if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
2224 {
2225 /* Adjust origin to the actual array data, else the CUDA 3D memory
2226 copy API call below may fail to validate source/dest pointers
2227 correctly (especially for Fortran where the "virtual origin" of an
2228 array is often outside the stored data). */
2229 if (src_ord == -1)
2230 data.srcHost
2231 = (const void *) ((const char *) data.srcHost
2232 + (data.srcZ * data.srcHeight + data.srcY)
2233 * data.srcPitch
2234 + data.srcXInBytes);
2235 else
2236 data.srcDevice
2237 += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
2238 + data.srcXInBytes;
2239 data.srcXInBytes = 0;
2240 data.srcY = 0;
2241 data.srcZ = 0;
2242 }
2244 if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
2245 {
2246 /* As above. */
2247 if (dst_ord == -1)
2248 data.dstHost = (void *) ((char *) data.dstHost
2249 + (data.dstZ * data.dstHeight + data.dstY)
2250 * data.dstPitch
2251 + data.dstXInBytes);
2252 else
2253 data.dstDevice
2254 += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
2255 + data.dstXInBytes;
2256 data.dstXInBytes = 0;
2257 data.dstY = 0;
2258 data.dstZ = 0;
2259 }
2261 CUDA_CALL (cuMemcpy3D, &data);
2262 return true;
2265 bool
2266 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
2267 size_t n, struct goacc_asyncqueue *aq)
2269 if (!nvptx_attach_host_thread_to_device (ord)
2270 || !cuda_memcpy_sanity_check (src, dst, n))
2271 return false;
2272 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
2273 return true;
2276 bool
2277 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
2278 size_t n, struct goacc_asyncqueue *aq)
2280 if (!nvptx_attach_host_thread_to_device (ord)
2281 || !cuda_memcpy_sanity_check (dst, src, n))
2282 return false;
2283 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
2284 return true;
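/* Return the value of OpenACC device property PROP for device N, or a
   zero/NULL value if N is out of range or the device has not been
   initialized.  */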
2287 union goacc_property_value
2288 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
2290 union goacc_property_value propval = { .val = 0 };
2292 pthread_mutex_lock (&ptx_dev_lock);
2294 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
2296 pthread_mutex_unlock (&ptx_dev_lock);
2297 return propval;
2300 struct ptx_device *ptx_dev = ptx_devices[n];
2301 switch (prop)
2303 case GOACC_PROPERTY_MEMORY:
2305 size_t total_mem;
2307 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2308 propval.val = total_mem;
2310 break;
2311 case GOACC_PROPERTY_FREE_MEMORY:
2313 size_t total_mem;
2314 size_t free_mem;
2315 CUdevice ctxdev;
2317 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2318 if (ptx_dev->dev == ctxdev)
2319 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2320 else if (ptx_dev->ctx)
2322 CUcontext old_ctx;
2324 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2325 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2326 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2328 else
2330 CUcontext new_ctx;
2332 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2333 ptx_dev->dev);
2334 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2335 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2337 propval.val = free_mem;
2339 break;
2340 case GOACC_PROPERTY_NAME:
2341 propval.ptr = ptx_dev->name;
2342 break;
2343 case GOACC_PROPERTY_VENDOR:
2344 propval.ptr = "Nvidia";
2345 break;
2346 case GOACC_PROPERTY_DRIVER:
2347 propval.ptr = cuda_driver_version_s;
2348 break;
2349 default:
2350 break;
2353 pthread_mutex_unlock (&ptx_dev_lock);
2354 return propval;
2357 /* Adjust launch dimensions: pick good values for the number of blocks and
2358 warps, and ensure that the number of warps exceeds neither the CUDA limits
2359 nor GCC's own limits. */
2361 static void
2362 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2363 struct ptx_device *ptx_dev,
2364 int *teams_p, int *threads_p)
2366 int max_warps_block = fn->max_threads_per_block / 32;
2367 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2368 backend and in libgcc; it matches the documented limit of all GPUs as of 2015. */
2369 if (max_warps_block > 32)
2370 max_warps_block = 32;
2371 if (*threads_p <= 0)
2372 *threads_p = 8;
2373 if (*threads_p > max_warps_block)
2374 *threads_p = max_warps_block;
2376 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2377 /* This is an estimate of how many blocks the device can host simultaneously.
2378 The actual limit, which may be lower, can be queried through the "occupancy
2379 control" driver interface (available since CUDA 6.0). */
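/* Purely illustrative example (numbers are not from this file): with 16
   registers per thread and 8 warps per block, regs_per_block is
   16 * 32 * 8 = 4096, so a device with 65536 registers per SM would be
   estimated at 65536 / 4096 = 16 blocks per SM.  */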
2380 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2381 if (*teams_p <= 0 || *teams_p > max_blocks)
2382 *teams_p = max_blocks;
2385 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2386 target regions. */
2388 static size_t
2389 nvptx_stacks_size ()
2391 return 128 * 1024;
2394 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2395 the storage should be held on entry, and remains held on exit. */
2397 static void *
2398 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2400 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2401 return (void *) ptx_dev->omp_stacks.ptr;
2403 /* Free the old, too-small stacks. */
2404 if (ptx_dev->omp_stacks.ptr)
2406 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2407 if (r != CUDA_SUCCESS)
2408 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2409 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2410 if (r != CUDA_SUCCESS)
2411 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2414 /* Make new and bigger stacks, and remember where we put them and how big
2415 they are. */
2416 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2417 size * num);
2418 if (r != CUDA_SUCCESS)
2419 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2421 ptx_dev->omp_stacks.size = size * num;
2423 return (void *) ptx_dev->omp_stacks.ptr;
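/* Run the OpenMP target region TGT_FN on device ORD: decode the num_teams
   and thread_limit arguments from ARGS, choose launch bounds, reserve the
   soft-stacks block, launch the kernel, and wait for it to finish.  When
   reverse offload is in use, the wait loop also services host-call requests
   posted by the device.  */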
2427 void
2428 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2430 struct targ_fn_descriptor *tgt_fn_desc
2431 = (struct targ_fn_descriptor *) tgt_fn;
2432 CUfunction function = tgt_fn_desc->fn;
2433 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2434 const char *fn_name = launch->fn;
2435 CUresult r;
2436 struct ptx_device *ptx_dev = ptx_devices[ord];
2437 const char *maybe_abort_msg = "(perhaps abort was called)";
2438 int teams = 0, threads = 0;
2440 if (!args)
2441 GOMP_PLUGIN_fatal ("No target arguments provided");
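/* Each element of ARGS encodes an id; the associated value either follows
   in the next element (GOMP_TARGET_ARG_SUBSEQUENT_PARAM) or is embedded in
   the id itself.  Only NUM_TEAMS and THREAD_LIMIT directed at all devices
   are of interest here.  */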
2442 while (*args)
2444 intptr_t id = (intptr_t) *args++, val;
2445 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2446 val = (intptr_t) *args++;
2447 else
2448 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2449 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2450 continue;
2451 val = val > INT_MAX ? INT_MAX : val;
2452 id &= GOMP_TARGET_ARG_ID_MASK;
2453 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2454 teams = val;
2455 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2456 threads = val;
2458 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2460 bool reverse_offload = ptx_dev->rev_data != NULL;
2461 struct goacc_asyncqueue *reverse_offload_aq = NULL;
2462 if (reverse_offload)
2464 reverse_offload_aq
2465 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2466 if (!reverse_offload_aq)
2467 exit (EXIT_FAILURE);
2470 size_t stack_size = nvptx_stacks_size ();
2472 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2473 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2474 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2475 size_t fn_args_size = sizeof fn_args;
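/* Pass the argument block through cuLaunchKernel's "extra" parameter list
   (CU_LAUNCH_PARAM_BUFFER_POINTER/SIZE) rather than as individual kernel
   parameters.  */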
2476 void *config[] = {
2477 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2478 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2479 CU_LAUNCH_PARAM_END
2481 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2482 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2483 __FUNCTION__, fn_name, teams, threads);
2484 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2485 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
2486 if (r != CUDA_SUCCESS)
2487 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
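/* With reverse offload enabled, poll the kernel's stream and, whenever the
   device has posted a request in rev_data->fn, run the corresponding host
   function via GOMP_PLUGIN_target_rev, then clear the request so the device
   can continue.  */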
2488 if (reverse_offload)
2489 while (true)
2490 {
2491 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2492 if (r == CUDA_SUCCESS)
2493 break;
2494 if (r == CUDA_ERROR_LAUNCH_FAILED)
2495 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2496 maybe_abort_msg);
2497 else if (r != CUDA_ERROR_NOT_READY)
2498 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2500 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2501 {
2502 struct rev_offload *rev_data = ptx_dev->rev_data;
2503 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2504 rev_data->addrs, rev_data->sizes,
2505 rev_data->kinds, rev_data->dev_num,
2506 reverse_offload_aq);
2507 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2508 exit (EXIT_FAILURE);
2509 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2510 }
2511 usleep (1);
2512 }
2513 else
2514 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2515 if (r == CUDA_ERROR_LAUNCH_FAILED)
2516 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2517 maybe_abort_msg);
2518 else if (r != CUDA_SUCCESS)
2519 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2521 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2523 if (reverse_offload)
2525 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2526 exit (EXIT_FAILURE);
2530 /* TODO: Implement GOMP_OFFLOAD_async_run. */