xfail scan-tree-dump-not throw in g++.dg/pr99966.C on hppa*64*-*-*
[official-gcc.git] / libgomp / plugin / plugin-nvptx.c
blobc04c3acd67926529f154cbbeec560035b6eca86d
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
46 #include <pthread.h>
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
49 #else
50 # include <cuda.h>
51 #endif
52 #include <stdbool.h>
53 #include <limits.h>
54 #include <string.h>
55 #include <stdio.h>
56 #include <unistd.h>
57 #include <assert.h>
58 #include <errno.h>
59 #include <stdlib.h>
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
70 #if CUDA_VERSION < 6000
71 extern CUresult cuGetErrorString (CUresult, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
73 #endif
75 #if CUDA_VERSION >= 6050
76 #undef cuLinkCreate
77 #undef cuLinkAddData
78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81 #else
82 typedef size_t (*CUoccupancyB2DSize)(int);
83 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84 const char *, unsigned, CUjit_option *, void **);
85 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
86 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
87 CUoccupancyB2DSize, size_t, int);
88 #endif
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
93 # include <dlfcn.h>
95 struct cuda_lib_s {
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
100 CUDA_ONE_CALL (call)
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
105 } cuda_lib;
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited = -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
113 static bool
114 init_cuda_lib (void)
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
129 return false;
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited = true;
136 return true;
138 # define CUDA_CALL_PREFIX cuda_lib.
139 #else
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
145 #undef CUDA_ONE_CALL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
149 #endif
151 #include "secure_getenv.h"
153 #undef MIN
154 #undef MAX
155 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
156 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
158 /* Convenience macros for the frequently used CUDA library call and
159 error handling sequence as well as CUDA library calls that
160 do the error checking themselves or don't do it at all. */
162 #define CUDA_CALL_ERET(ERET, FN, ...) \
163 do { \
164 unsigned __r \
165 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
166 if (__r != CUDA_SUCCESS) \
168 GOMP_PLUGIN_error (#FN " error: %s", \
169 cuda_error (__r)); \
170 return ERET; \
172 } while (0)
174 #define CUDA_CALL(FN, ...) \
175 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
177 #define CUDA_CALL_ASSERT(FN, ...) \
178 do { \
179 unsigned __r \
180 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
181 if (__r != CUDA_SUCCESS) \
183 GOMP_PLUGIN_fatal (#FN " error: %s", \
184 cuda_error (__r)); \
186 } while (0)
188 #define CUDA_CALL_NOCHECK(FN, ...) \
189 CUDA_CALL_PREFIX FN (__VA_ARGS__)
191 #define CUDA_CALL_EXISTS(FN) \
192 CUDA_CALL_PREFIX FN
194 static const char *
195 cuda_error (CUresult r)
197 const char *fallback = "unknown cuda error";
198 const char *desc;
200 if (!CUDA_CALL_EXISTS (cuGetErrorString))
201 return fallback;
203 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
204 if (r == CUDA_SUCCESS)
205 return desc;
207 return fallback;
210 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
211 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
212 static char cuda_driver_version_s[30];
214 static unsigned int instantiated_devices = 0;
215 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
217 /* NVPTX/CUDA specific definition of asynchronous queues. */
218 struct goacc_asyncqueue
220 CUstream cuda_stream;
223 struct nvptx_callback
225 void (*fn) (void *);
226 void *ptr;
227 struct goacc_asyncqueue *aq;
228 struct nvptx_callback *next;
231 /* Thread-specific data for PTX. */
233 struct nvptx_thread
235 /* We currently have this embedded inside the plugin because libgomp manages
236 devices through integer target_ids. This might be better if using an
237 opaque target-specific pointer directly from gomp_device_descr. */
238 struct ptx_device *ptx_dev;
241 /* Target data function launch information. */
243 struct targ_fn_launch
245 const char *fn;
246 unsigned short dim[GOMP_DIM_MAX];
249 /* Target PTX object information. */
251 struct targ_ptx_obj
253 const char *code;
254 size_t size;
257 /* Target data image information. */
259 typedef struct nvptx_tdata
261 const struct targ_ptx_obj *ptx_objs;
262 unsigned ptx_num;
264 const char *const *var_names;
265 unsigned var_num;
267 const struct targ_fn_launch *fn_descs;
268 unsigned fn_num;
270 unsigned ind_fn_num;
271 } nvptx_tdata_t;
273 /* Descriptor of a loaded function. */
275 struct targ_fn_descriptor
277 CUfunction fn;
278 const struct targ_fn_launch *launch;
279 int regs_per_thread;
280 int max_threads_per_block;
283 /* A loaded PTX image. */
284 struct ptx_image_data
286 const void *target_data;
287 CUmodule module;
289 struct targ_fn_descriptor *fns; /* Array of functions. */
291 struct ptx_image_data *next;
294 struct ptx_free_block
296 void *ptr;
297 struct ptx_free_block *next;
300 struct ptx_device
302 CUcontext ctx;
303 bool ctx_shared;
304 CUdevice dev;
306 int ord;
307 bool overlap;
308 bool map;
309 bool concur;
310 bool mkern;
311 int mode;
312 int clock_khz;
313 int num_sms;
314 int regs_per_block;
315 int regs_per_sm;
316 int warp_size;
317 int max_threads_per_block;
318 int max_threads_per_multiprocessor;
319 int default_dims[GOMP_DIM_MAX];
321 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
322 char name[256];
324 struct ptx_image_data *images; /* Images loaded on device. */
325 pthread_mutex_t image_lock; /* Lock for above list. */
327 struct ptx_free_block *free_blocks;
328 pthread_mutex_t free_blocks_lock;
330 /* OpenMP stacks, cached between kernel invocations. */
331 struct
333 CUdeviceptr ptr;
334 size_t size;
335 pthread_mutex_t lock;
336 } omp_stacks;
338 struct rev_offload *rev_data;
339 struct ptx_device *next;
342 static struct ptx_device **ptx_devices;
344 /* OpenMP kernels reserve a small amount of ".shared" space for use by
345 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
346 default is set here. */
347 static unsigned lowlat_pool_size = 8 * 1024;
349 static inline struct nvptx_thread *
350 nvptx_thread (void)
352 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
355 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
356 should be locked on entry and remains locked on exit. */
358 static bool
359 nvptx_init (void)
361 int ndevs;
363 if (instantiated_devices != 0)
364 return true;
366 if (!init_cuda_lib ())
367 return false;
369 CUDA_CALL (cuInit, 0);
371 int cuda_driver_version;
372 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
373 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
374 "CUDA Driver %u.%u",
375 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
377 CUDA_CALL (cuDeviceGetCount, &ndevs);
378 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
379 * ndevs);
381 return true;
384 /* Select the N'th PTX device for the current host thread. The device must
385 have been previously opened before calling this function. */
387 static bool
388 nvptx_attach_host_thread_to_device (int n)
390 CUdevice dev;
391 CUresult r;
392 struct ptx_device *ptx_dev;
393 CUcontext thd_ctx;
395 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
396 if (r == CUDA_ERROR_NOT_PERMITTED)
398 /* Assume we're in a CUDA callback, just return true. */
399 return true;
401 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
404 return false;
407 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
408 return true;
409 else
411 CUcontext old_ctx;
413 ptx_dev = ptx_devices[n];
414 if (!ptx_dev)
416 GOMP_PLUGIN_error ("device %d not found", n);
417 return false;
420 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
422 /* We don't necessarily have a current context (e.g. if it has been
423 destroyed. Pop it if we do though. */
424 if (thd_ctx != NULL)
425 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
427 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
429 return true;
432 static struct ptx_device *
433 nvptx_open_device (int n)
435 struct ptx_device *ptx_dev;
436 CUdevice dev, ctx_dev;
437 CUresult r;
438 int pi;
440 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
442 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
444 ptx_dev->ord = n;
445 ptx_dev->dev = dev;
446 ptx_dev->ctx_shared = false;
448 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
449 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
451 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
452 return NULL;
455 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
457 /* The current host thread has an active context for a different device.
458 Detach it. */
459 CUcontext old_ctx;
460 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
463 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
465 if (!ptx_dev->ctx)
466 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
467 else
468 ptx_dev->ctx_shared = true;
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
472 ptx_dev->overlap = pi;
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
476 ptx_dev->map = pi;
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
480 ptx_dev->concur = pi;
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
484 ptx_dev->mode = pi;
486 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
487 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
488 ptx_dev->mkern = pi;
490 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
491 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
492 ptx_dev->clock_khz = pi;
494 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
495 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
496 ptx_dev->num_sms = pi;
498 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
499 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
500 ptx_dev->regs_per_block = pi;
502 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
503 in CUDA 6.0 and newer. */
504 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
505 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
506 dev);
507 /* Fallback: use limit of registers per block, which is usually equal. */
508 if (r == CUDA_ERROR_INVALID_VALUE)
509 pi = ptx_dev->regs_per_block;
510 else if (r != CUDA_SUCCESS)
512 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
513 return NULL;
515 ptx_dev->regs_per_sm = pi;
517 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
518 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
519 if (pi != 32)
521 GOMP_PLUGIN_error ("Only warp size 32 is supported");
522 return NULL;
524 ptx_dev->warp_size = pi;
526 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
527 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
528 ptx_dev->max_threads_per_block = pi;
530 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
531 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
532 ptx_dev->max_threads_per_multiprocessor = pi;
534 /* Required below for reverse offload as implemented, but with compute
535 capability >= 2.0 and 64bit device processes, this should be universally be
536 the case; hence, an assert. */
537 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
538 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
539 assert (r == CUDA_SUCCESS && pi);
541 for (int i = 0; i != GOMP_DIM_MAX; i++)
542 ptx_dev->default_dims[i] = 0;
544 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
545 dev);
547 ptx_dev->images = NULL;
548 pthread_mutex_init (&ptx_dev->image_lock, NULL);
550 ptx_dev->free_blocks = NULL;
551 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
553 ptx_dev->omp_stacks.ptr = 0;
554 ptx_dev->omp_stacks.size = 0;
555 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
557 ptx_dev->rev_data = NULL;
559 return ptx_dev;
562 static bool
563 nvptx_close_device (struct ptx_device *ptx_dev)
565 if (!ptx_dev)
566 return true;
568 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
570 struct ptx_free_block *b_next = b->next;
571 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
572 free (b);
573 b = b_next;
576 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
577 pthread_mutex_destroy (&ptx_dev->image_lock);
579 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
581 if (ptx_dev->omp_stacks.ptr)
582 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
584 if (!ptx_dev->ctx_shared)
585 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
587 free (ptx_dev);
588 return true;
591 static int
592 nvptx_get_num_devices (void)
594 int n;
596 /* This function will be called before the plugin has been initialized in
597 order to enumerate available devices, but CUDA API routines can't be used
598 until cuInit has been called. Just call it now (but don't yet do any
599 further initialization). */
600 if (instantiated_devices == 0)
602 if (!init_cuda_lib ())
603 return 0;
604 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
605 /* This is not an error: e.g. we may have CUDA libraries installed but
606 no devices available. */
607 if (r != CUDA_SUCCESS)
609 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
610 cuda_error (r));
611 return 0;
615 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
616 return n;
619 static void
620 notify_var (const char *var_name, const char *env_var)
622 if (env_var == NULL)
623 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
624 else
625 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
628 static void
629 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
631 const char *var_name = "GOMP_NVPTX_JIT";
632 const char *env_var = secure_getenv (var_name);
633 notify_var (var_name, env_var);
635 if (env_var == NULL)
636 return;
638 const char *c = env_var;
639 while (*c != '\0')
641 while (*c == ' ')
642 c++;
644 if (c[0] == '-' && c[1] == 'O'
645 && '0' <= c[2] && c[2] <= '4'
646 && (c[3] == '\0' || c[3] == ' '))
648 *gomp_nvptx_o = c[2] - '0';
649 c += 3;
650 continue;
653 GOMP_PLUGIN_error ("Error parsing %s", var_name);
654 break;
658 static bool
659 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
660 unsigned num_objs)
662 CUjit_option opts[7];
663 void *optvals[7];
664 float elapsed = 0.0;
665 char elog[1024];
666 char ilog[16384];
667 CUlinkState linkstate;
668 CUresult r;
669 void *linkout;
670 size_t linkoutsize __attribute__ ((unused));
672 opts[0] = CU_JIT_WALL_TIME;
673 optvals[0] = &elapsed;
675 opts[1] = CU_JIT_INFO_LOG_BUFFER;
676 optvals[1] = &ilog[0];
678 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
679 optvals[2] = (void *) sizeof ilog;
681 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
682 optvals[3] = &elog[0];
684 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
685 optvals[4] = (void *) sizeof elog;
687 opts[5] = CU_JIT_LOG_VERBOSE;
688 optvals[5] = (void *) 1;
690 static intptr_t gomp_nvptx_o = -1;
692 static bool init_done = false;
693 if (!init_done)
695 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
696 init_done = true;
699 int nopts = 6;
700 if (gomp_nvptx_o != -1)
702 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
703 optvals[nopts] = (void *) gomp_nvptx_o;
704 nopts++;
707 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
708 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
709 else
710 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
712 for (; num_objs--; ptx_objs++)
714 /* cuLinkAddData's 'data' argument erroneously omits the const
715 qualifier. */
716 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
717 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
718 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
719 (char *) ptx_objs->code, ptx_objs->size,
720 0, 0, 0, 0);
721 else
722 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
723 (char *) ptx_objs->code, ptx_objs->size,
724 0, 0, 0, 0);
725 if (r != CUDA_SUCCESS)
727 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
728 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
729 cuda_error (r));
730 return false;
734 GOMP_PLUGIN_debug (0, "Linking\n");
735 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
737 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
738 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
740 if (r != CUDA_SUCCESS)
742 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
743 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
744 return false;
747 CUDA_CALL (cuModuleLoadData, module, linkout);
748 CUDA_CALL (cuLinkDestroy, linkstate);
749 return true;
752 static void
753 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
754 CUdeviceptr dp, CUstream stream)
756 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
757 CUfunction function;
758 int i;
759 void *kargs[1];
760 struct nvptx_thread *nvthd = nvptx_thread ();
761 int warp_size = nvthd->ptx_dev->warp_size;
763 function = targ_fn->fn;
765 /* Initialize the launch dimensions. Typically this is constant,
766 provided by the device compiler, but we must permit runtime
767 values. */
768 int seen_zero = 0;
769 for (i = 0; i != GOMP_DIM_MAX; i++)
771 if (targ_fn->launch->dim[i])
772 dims[i] = targ_fn->launch->dim[i];
773 if (!dims[i])
774 seen_zero = 1;
777 if (seen_zero)
779 pthread_mutex_lock (&ptx_dev_lock);
781 static int gomp_openacc_dims[GOMP_DIM_MAX];
782 if (!gomp_openacc_dims[0])
784 /* See if the user provided GOMP_OPENACC_DIM environment
785 variable to specify runtime defaults. */
786 for (int i = 0; i < GOMP_DIM_MAX; ++i)
787 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
790 if (!nvthd->ptx_dev->default_dims[0])
792 int default_dims[GOMP_DIM_MAX];
793 for (int i = 0; i < GOMP_DIM_MAX; ++i)
794 default_dims[i] = gomp_openacc_dims[i];
796 int gang, worker, vector;
798 int block_size = nvthd->ptx_dev->max_threads_per_block;
799 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
800 int dev_size = nvthd->ptx_dev->num_sms;
801 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
802 " dev_size=%d, cpu_size=%d\n",
803 warp_size, block_size, dev_size, cpu_size);
805 gang = (cpu_size / block_size) * dev_size;
806 worker = block_size / warp_size;
807 vector = warp_size;
810 /* There is no upper bound on the gang size. The best size
811 matches the hardware configuration. Logical gangs are
812 scheduled onto physical hardware. To maximize usage, we
813 should guess a large number. */
814 if (default_dims[GOMP_DIM_GANG] < 1)
815 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
816 /* The worker size must not exceed the hardware. */
817 if (default_dims[GOMP_DIM_WORKER] < 1
818 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
819 default_dims[GOMP_DIM_WORKER] = worker;
820 /* The vector size must exactly match the hardware. */
821 if (default_dims[GOMP_DIM_VECTOR] < 1
822 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
823 default_dims[GOMP_DIM_VECTOR] = vector;
825 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
826 default_dims[GOMP_DIM_GANG],
827 default_dims[GOMP_DIM_WORKER],
828 default_dims[GOMP_DIM_VECTOR]);
830 for (i = 0; i != GOMP_DIM_MAX; i++)
831 nvthd->ptx_dev->default_dims[i] = default_dims[i];
833 pthread_mutex_unlock (&ptx_dev_lock);
836 bool default_dim_p[GOMP_DIM_MAX];
837 for (i = 0; i != GOMP_DIM_MAX; i++)
838 default_dim_p[i] = !dims[i];
840 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
842 for (i = 0; i != GOMP_DIM_MAX; i++)
843 if (default_dim_p[i])
844 dims[i] = nvthd->ptx_dev->default_dims[i];
846 if (default_dim_p[GOMP_DIM_VECTOR])
847 dims[GOMP_DIM_VECTOR]
848 = MIN (dims[GOMP_DIM_VECTOR],
849 (targ_fn->max_threads_per_block / warp_size
850 * warp_size));
852 if (default_dim_p[GOMP_DIM_WORKER])
853 dims[GOMP_DIM_WORKER]
854 = MIN (dims[GOMP_DIM_WORKER],
855 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
857 else
859 /* Handle the case that the compiler allows the runtime to choose
860 the vector-length conservatively, by ignoring
861 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
862 it. */
863 int vectors = 0;
864 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
865 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
866 exceed targ_fn->max_threads_per_block. */
867 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
868 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
869 int grids, blocks;
871 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
872 &blocks, function, NULL, 0,
873 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
874 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
875 "grid = %d, block = %d\n", grids, blocks);
877 /* Keep the num_gangs proportional to the block size. In
878 the case were a block size is limited by shared-memory
879 or the register file capacity, the runtime will not
880 excessively over assign gangs to the multiprocessor
881 units if their state is going to be swapped out even
882 more than necessary. The constant factor 2 is there to
883 prevent threads from idling when there is insufficient
884 work for them. */
885 if (gangs == 0)
886 gangs = 2 * grids * (blocks / warp_size);
888 if (vectors == 0)
889 vectors = warp_size;
891 if (workers == 0)
893 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
894 ? vectors
895 : dims[GOMP_DIM_VECTOR]);
896 workers = blocks / actual_vectors;
897 workers = MAX (workers, 1);
898 /* If we need a per-worker barrier ... . */
899 if (actual_vectors > 32)
900 /* Don't use more barriers than available. */
901 workers = MIN (workers, 15);
904 for (i = 0; i != GOMP_DIM_MAX; i++)
905 if (default_dim_p[i])
906 switch (i)
908 case GOMP_DIM_GANG: dims[i] = gangs; break;
909 case GOMP_DIM_WORKER: dims[i] = workers; break;
910 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
911 default: GOMP_PLUGIN_fatal ("invalid dim");
917 /* Check if the accelerator has sufficient hardware resources to
918 launch the offloaded kernel. */
919 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
920 > targ_fn->max_threads_per_block)
922 const char *msg
923 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
924 " with num_workers = %d and vector_length = %d"
925 "; "
926 "recompile the program with 'num_workers = x and vector_length = y'"
927 " on that offloaded region or '-fopenacc-dim=:x:y' where"
928 " x * y <= %d"
929 ".\n");
930 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
931 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
934 /* Check if the accelerator has sufficient barrier resources to
935 launch the offloaded kernel. */
936 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
938 const char *msg
939 = ("The Nvidia accelerator has insufficient barrier resources to launch"
940 " '%s' with num_workers = %d and vector_length = %d"
941 "; "
942 "recompile the program with 'num_workers = x' on that offloaded"
943 " region or '-fopenacc-dim=:x:' where x <= 15"
944 "; "
945 "or, recompile the program with 'vector_length = 32' on that"
946 " offloaded region or '-fopenacc-dim=::32'"
947 ".\n");
948 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
949 dims[GOMP_DIM_VECTOR]);
952 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
953 " gangs=%u, workers=%u, vectors=%u\n",
954 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
955 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
957 // OpenACC CUDA
959 // num_gangs nctaid.x
960 // num_workers ntid.y
961 // vector length ntid.x
963 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
964 acc_prof_info *prof_info = thr->prof_info;
965 acc_event_info enqueue_launch_event_info;
966 acc_api_info *api_info = thr->api_info;
967 bool profiling_p = __builtin_expect (prof_info != NULL, false);
968 if (profiling_p)
970 prof_info->event_type = acc_ev_enqueue_launch_start;
972 enqueue_launch_event_info.launch_event.event_type
973 = prof_info->event_type;
974 enqueue_launch_event_info.launch_event.valid_bytes
975 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
976 enqueue_launch_event_info.launch_event.parent_construct
977 = acc_construct_parallel;
978 enqueue_launch_event_info.launch_event.implicit = 1;
979 enqueue_launch_event_info.launch_event.tool_info = NULL;
980 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
981 enqueue_launch_event_info.launch_event.num_gangs
982 = dims[GOMP_DIM_GANG];
983 enqueue_launch_event_info.launch_event.num_workers
984 = dims[GOMP_DIM_WORKER];
985 enqueue_launch_event_info.launch_event.vector_length
986 = dims[GOMP_DIM_VECTOR];
988 api_info->device_api = acc_device_api_cuda;
990 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
991 api_info);
994 kargs[0] = &dp;
995 CUDA_CALL_ASSERT (cuLaunchKernel, function,
996 dims[GOMP_DIM_GANG], 1, 1,
997 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
998 0, stream, kargs, 0);
1000 if (profiling_p)
1002 prof_info->event_type = acc_ev_enqueue_launch_end;
1003 enqueue_launch_event_info.launch_event.event_type
1004 = prof_info->event_type;
1005 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1006 api_info);
1009 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1010 targ_fn->launch->fn);
1013 void * openacc_get_current_cuda_context (void);
1015 static void
1016 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1018 acc_prof_info *prof_info = thr->prof_info;
1019 acc_event_info data_event_info;
1020 acc_api_info *api_info = thr->api_info;
1022 prof_info->event_type = acc_ev_alloc;
1024 data_event_info.data_event.event_type = prof_info->event_type;
1025 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1026 data_event_info.data_event.parent_construct = acc_construct_parallel;
1027 data_event_info.data_event.implicit = 1;
1028 data_event_info.data_event.tool_info = NULL;
1029 data_event_info.data_event.var_name = NULL;
1030 data_event_info.data_event.bytes = s;
1031 data_event_info.data_event.host_ptr = NULL;
1032 data_event_info.data_event.device_ptr = dp;
1034 api_info->device_api = acc_device_api_cuda;
1036 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1039 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1040 size threshold, or if FORCE is true. */
1042 static void
1043 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1045 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1046 if (ptx_dev->omp_stacks.ptr
1047 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1049 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1050 if (r != CUDA_SUCCESS)
1051 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1052 ptx_dev->omp_stacks.ptr = 0;
1053 ptx_dev->omp_stacks.size = 0;
1055 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1058 static void *
1059 nvptx_alloc (size_t s, bool suppress_errors)
1061 CUdeviceptr d;
1063 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1064 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1065 return NULL;
1066 else if (r != CUDA_SUCCESS)
1068 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1069 return NULL;
1072 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1073 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1074 bool profiling_p
1075 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1076 if (profiling_p)
1077 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1079 return (void *) d;
1082 static void
1083 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1085 acc_prof_info *prof_info = thr->prof_info;
1086 acc_event_info data_event_info;
1087 acc_api_info *api_info = thr->api_info;
1089 prof_info->event_type = acc_ev_free;
1091 data_event_info.data_event.event_type = prof_info->event_type;
1092 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1093 data_event_info.data_event.parent_construct = acc_construct_parallel;
1094 data_event_info.data_event.implicit = 1;
1095 data_event_info.data_event.tool_info = NULL;
1096 data_event_info.data_event.var_name = NULL;
1097 data_event_info.data_event.bytes = -1;
1098 data_event_info.data_event.host_ptr = NULL;
1099 data_event_info.data_event.device_ptr = p;
1101 api_info->device_api = acc_device_api_cuda;
1103 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1106 static bool
1107 nvptx_free (void *p, struct ptx_device *ptx_dev)
1109 CUdeviceptr pb;
1110 size_t ps;
1112 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1113 (CUdeviceptr) p);
1114 if (r == CUDA_ERROR_NOT_PERMITTED)
1116 /* We assume that this error indicates we are in a CUDA callback context,
1117 where all CUDA calls are not allowed (see cuStreamAddCallback
1118 documentation for description). Arrange to free this piece of device
1119 memory later. */
1120 struct ptx_free_block *n
1121 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1122 n->ptr = p;
1123 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1124 n->next = ptx_dev->free_blocks;
1125 ptx_dev->free_blocks = n;
1126 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1127 return true;
1129 else if (r != CUDA_SUCCESS)
1131 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1132 return false;
1134 if ((CUdeviceptr) p != pb)
1136 GOMP_PLUGIN_error ("invalid device address");
1137 return false;
1140 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1141 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1142 bool profiling_p
1143 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1144 if (profiling_p)
1145 goacc_profiling_acc_ev_free (thr, p);
1147 return true;
1150 static void *
1151 nvptx_get_current_cuda_device (void)
1153 struct nvptx_thread *nvthd = nvptx_thread ();
1155 if (!nvthd || !nvthd->ptx_dev)
1156 return NULL;
1158 return &nvthd->ptx_dev->dev;
1161 static void *
1162 nvptx_get_current_cuda_context (void)
1164 struct nvptx_thread *nvthd = nvptx_thread ();
1166 if (!nvthd || !nvthd->ptx_dev)
1167 return NULL;
1169 return nvthd->ptx_dev->ctx;
1172 /* Plugin entry points. */
1174 const char *
1175 GOMP_OFFLOAD_get_name (void)
1177 return "nvptx";
1180 unsigned int
1181 GOMP_OFFLOAD_get_caps (void)
1183 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1187 GOMP_OFFLOAD_get_type (void)
1189 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1193 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1195 int num_devices = nvptx_get_num_devices ();
1196 /* Return -1 if no omp_requires_mask cannot be fulfilled but
1197 devices were present. Unified-shared address: see comment in
1198 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1199 if (num_devices > 0
1200 && ((omp_requires_mask
1201 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1202 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1203 return -1;
1204 return num_devices;
1207 bool
1208 GOMP_OFFLOAD_init_device (int n)
1210 struct ptx_device *dev;
1212 pthread_mutex_lock (&ptx_dev_lock);
1214 if (!nvptx_init () || ptx_devices[n] != NULL)
1216 pthread_mutex_unlock (&ptx_dev_lock);
1217 return false;
1220 dev = nvptx_open_device (n);
1221 if (dev)
1223 ptx_devices[n] = dev;
1224 instantiated_devices++;
1227 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1228 const char *env_var = secure_getenv (var_name);
1229 notify_var (var_name, env_var);
1231 if (env_var != NULL)
1233 char *endptr;
1234 unsigned long val = strtoul (env_var, &endptr, 10);
1235 if (endptr == NULL || *endptr != '\0'
1236 || errno == ERANGE || errno == EINVAL
1237 || val > UINT_MAX)
1238 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1239 else
1240 lowlat_pool_size = val;
1243 pthread_mutex_unlock (&ptx_dev_lock);
1245 return dev != NULL;
1248 bool
1249 GOMP_OFFLOAD_fini_device (int n)
1251 pthread_mutex_lock (&ptx_dev_lock);
1253 if (ptx_devices[n] != NULL)
1255 if (!nvptx_attach_host_thread_to_device (n)
1256 || !nvptx_close_device (ptx_devices[n]))
1258 pthread_mutex_unlock (&ptx_dev_lock);
1259 return false;
1261 ptx_devices[n] = NULL;
1262 instantiated_devices--;
1265 if (instantiated_devices == 0)
1267 free (ptx_devices);
1268 ptx_devices = NULL;
1271 pthread_mutex_unlock (&ptx_dev_lock);
1272 return true;
1275 /* Return the libgomp version number we're compatible with. There is
1276 no requirement for cross-version compatibility. */
1278 unsigned
1279 GOMP_OFFLOAD_version (void)
1281 return GOMP_VERSION;
1284 /* Initialize __nvptx_clocktick, if present in MODULE. */
1286 static void
1287 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1289 CUdeviceptr dptr;
1290 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1291 module, "__nvptx_clocktick");
1292 if (r == CUDA_ERROR_NOT_FOUND)
1293 return;
1294 if (r != CUDA_SUCCESS)
1295 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1296 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1297 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1298 sizeof (__nvptx_clocktick));
1299 if (r != CUDA_SUCCESS)
1300 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1303 /* Load the (partial) program described by TARGET_DATA to device
1304 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1305 will contain the on-device addresses of the functions for reverse offload.
1306 To be freed by the caller. */
1309 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1310 struct addr_pair **target_table,
1311 uint64_t **rev_fn_table,
1312 uint64_t *host_ind_fn_table)
1314 CUmodule module;
1315 const char *const *var_names;
1316 const struct targ_fn_launch *fn_descs;
1317 unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
1318 struct targ_fn_descriptor *targ_fns;
1319 struct addr_pair *targ_tbl;
1320 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1321 struct ptx_image_data *new_image;
1322 struct ptx_device *dev;
1324 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1326 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1327 " (expected %u, received %u)",
1328 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1329 return -1;
1332 if (!nvptx_attach_host_thread_to_device (ord)
1333 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1334 return -1;
1336 dev = ptx_devices[ord];
1338 /* The mkoffload utility emits a struct of pointers/integers at the
1339 start of each offload image. The array of kernel names and the
1340 functions addresses form a one-to-one correspondence. */
1342 var_entries = img_header->var_num;
1343 var_names = img_header->var_names;
1344 fn_entries = img_header->fn_num;
1345 fn_descs = img_header->fn_descs;
1346 ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1347 ? img_header->ind_fn_num : 0;
1349 /* Currently, other_entries contains only the struct of ICVs. */
1350 other_entries = 1;
1352 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1353 * (fn_entries + var_entries + other_entries));
1354 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1355 * fn_entries);
1357 *target_table = targ_tbl;
1359 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1360 new_image->target_data = target_data;
1361 new_image->module = module;
1362 new_image->fns = targ_fns;
1364 pthread_mutex_lock (&dev->image_lock);
1365 new_image->next = dev->images;
1366 dev->images = new_image;
1367 pthread_mutex_unlock (&dev->image_lock);
1369 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1371 CUfunction function;
1372 int nregs, mthrs;
1374 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1375 fn_descs[i].fn);
1376 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1377 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1378 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1379 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1381 targ_fns->fn = function;
1382 targ_fns->launch = &fn_descs[i];
1383 targ_fns->regs_per_thread = nregs;
1384 targ_fns->max_threads_per_block = mthrs;
1386 targ_tbl->start = (uintptr_t) targ_fns;
1387 targ_tbl->end = targ_tbl->start + 1;
1390 for (j = 0; j < var_entries; j++, targ_tbl++)
1392 CUdeviceptr var;
1393 size_t bytes;
1395 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1396 &var, &bytes, module, var_names[j]);
1398 targ_tbl->start = (uintptr_t) var;
1399 targ_tbl->end = targ_tbl->start + bytes;
1402 if (ind_fn_entries > 0)
1404 CUdeviceptr var;
1405 size_t bytes;
1407 /* Read indirect function table from image. */
1408 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1409 "$offload_ind_func_table");
1410 if (r != CUDA_SUCCESS)
1411 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1412 assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1414 uint64_t ind_fn_table[ind_fn_entries];
1415 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1416 if (r != CUDA_SUCCESS)
1417 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1419 /* Build host->target address map for indirect functions. */
1420 uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1421 for (unsigned k = 0; k < ind_fn_entries; k++)
1423 ind_fn_map[k * 2] = host_ind_fn_table[k];
1424 ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1425 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1426 k, host_ind_fn_table[k], ind_fn_table[k]);
1428 ind_fn_map[ind_fn_entries * 2] = 0;
1430 /* Write the map onto the target. */
1431 void *map_target_addr
1432 = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1433 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1435 GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1436 (void*) ind_fn_map,
1437 sizeof (ind_fn_map));
1439 /* Write address of the map onto the target. */
1440 CUdeviceptr varptr;
1441 size_t varsize;
1442 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1443 module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1444 if (r != CUDA_SUCCESS)
1445 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1446 cuda_error (r));
1448 GOMP_PLUGIN_debug (0,
1449 "Indirect map variable found at %llx with size %ld\n",
1450 varptr, varsize);
1452 GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1453 sizeof (map_target_addr));
1456 CUdeviceptr varptr;
1457 size_t varsize;
1458 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1459 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1461 if (r == CUDA_SUCCESS)
1463 targ_tbl->start = (uintptr_t) varptr;
1464 targ_tbl->end = (uintptr_t) (varptr + varsize);
1466 else
1467 /* The variable was not in this image. */
1468 targ_tbl->start = targ_tbl->end = 0;
1470 if (rev_fn_table && fn_entries == 0)
1471 *rev_fn_table = NULL;
1472 else if (rev_fn_table)
1474 CUdeviceptr var;
1475 size_t bytes;
1476 unsigned int i;
1477 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1478 "$offload_func_table");
1479 if (r != CUDA_SUCCESS)
1480 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1481 assert (bytes == sizeof (uint64_t) * fn_entries);
1482 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1483 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1484 if (r != CUDA_SUCCESS)
1485 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1486 /* Free if only NULL entries. */
1487 for (i = 0; i < fn_entries; ++i)
1488 if ((*rev_fn_table)[i] != 0)
1489 break;
1490 if (i == fn_entries)
1492 free (*rev_fn_table);
1493 *rev_fn_table = NULL;
1497 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1499 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1500 available but it might be not. One reason could be: if the user code
1501 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1502 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1503 are not linked in. */
1504 CUdeviceptr device_rev_offload_var;
1505 size_t device_rev_offload_size;
1506 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1507 &device_rev_offload_var,
1508 &device_rev_offload_size, module,
1509 XSTRING (GOMP_REV_OFFLOAD_VAR));
1510 if (r != CUDA_SUCCESS)
1512 free (*rev_fn_table);
1513 *rev_fn_table = NULL;
1515 else
1517 /* cuMemHostAlloc memory is accessible on the device, if
1518 unified-shared address is supported; this is assumed - see comment
1519 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1520 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1521 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1522 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1523 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1524 sizeof (dp));
1525 if (r != CUDA_SUCCESS)
1526 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1530 nvptx_set_clocktick (module, dev);
1532 return fn_entries + var_entries + other_entries;
1535 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1536 function descriptors allocated by G_O_load_image. */
1538 bool
1539 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1541 struct ptx_image_data *image, **prev_p;
1542 struct ptx_device *dev = ptx_devices[ord];
1544 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1546 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1547 " (expected %u, received %u)",
1548 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1549 return false;
1552 bool ret = true;
1553 pthread_mutex_lock (&dev->image_lock);
1554 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1555 if (image->target_data == target_data)
1557 *prev_p = image->next;
1558 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1559 ret = false;
1560 free (image->fns);
1561 free (image);
1562 break;
1564 pthread_mutex_unlock (&dev->image_lock);
1565 return ret;
1568 void *
1569 GOMP_OFFLOAD_alloc (int ord, size_t size)
1571 if (!nvptx_attach_host_thread_to_device (ord))
1572 return NULL;
1574 struct ptx_device *ptx_dev = ptx_devices[ord];
1575 struct ptx_free_block *blocks, *tmp;
1577 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1578 blocks = ptx_dev->free_blocks;
1579 ptx_dev->free_blocks = NULL;
1580 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1582 nvptx_stacks_free (ptx_dev, false);
1584 while (blocks)
1586 tmp = blocks->next;
1587 nvptx_free (blocks->ptr, ptx_dev);
1588 free (blocks);
1589 blocks = tmp;
1592 void *d = nvptx_alloc (size, true);
1593 if (d)
1594 return d;
1595 else
1597 /* Memory allocation failed. Try freeing the stacks block, and
1598 retrying. */
1599 nvptx_stacks_free (ptx_dev, true);
1600 return nvptx_alloc (size, false);
1604 bool
1605 GOMP_OFFLOAD_free (int ord, void *ptr)
1607 return (nvptx_attach_host_thread_to_device (ord)
1608 && nvptx_free (ptr, ptx_devices[ord]));
1611 void
1612 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1613 size_t mapnum __attribute__((unused)),
1614 void **hostaddrs __attribute__((unused)),
1615 void **devaddrs,
1616 unsigned *dims, void *targ_mem_desc)
1618 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1620 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1621 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
1623 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1624 const char *maybe_abort_msg = "(perhaps abort was called)";
1625 if (r == CUDA_ERROR_LAUNCH_FAILED)
1626 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1627 maybe_abort_msg);
1628 else if (r != CUDA_SUCCESS)
1629 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1632 void
1633 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1634 size_t mapnum __attribute__((unused)),
1635 void **hostaddrs __attribute__((unused)),
1636 void **devaddrs,
1637 unsigned *dims, void *targ_mem_desc,
1638 struct goacc_asyncqueue *aq)
1640 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1642 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1643 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
1646 void *
1647 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1649 struct ptx_device *ptx_dev;
1650 struct nvptx_thread *nvthd
1651 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1652 CUcontext thd_ctx;
1654 ptx_dev = ptx_devices[ord];
1656 assert (ptx_dev);
1658 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1660 assert (ptx_dev->ctx);
1662 if (!thd_ctx)
1663 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1665 nvthd->ptx_dev = ptx_dev;
1667 return (void *) nvthd;
1670 void
1671 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1673 free (data);
1676 void *
1677 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1679 return nvptx_get_current_cuda_device ();
1682 void *
1683 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1685 return nvptx_get_current_cuda_context ();
1688 /* This returns a CUstream. */
1689 void *
1690 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1692 return (void *) aq->cuda_stream;
1695 /* This takes a CUstream. */
1697 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1699 if (aq->cuda_stream)
1701 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1702 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1705 aq->cuda_stream = (CUstream) stream;
1706 return 1;
1709 static struct goacc_asyncqueue *
1710 nvptx_goacc_asyncqueue_construct (unsigned int flags)
1712 CUstream stream = NULL;
1713 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1715 struct goacc_asyncqueue *aq
1716 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1717 aq->cuda_stream = stream;
1718 return aq;
1721 struct goacc_asyncqueue *
1722 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1724 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1727 static bool
1728 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1730 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1731 free (aq);
1732 return true;
1735 bool
1736 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1738 return nvptx_goacc_asyncqueue_destruct (aq);
1742 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1744 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1745 if (r == CUDA_SUCCESS)
1746 return 1;
1747 if (r == CUDA_ERROR_NOT_READY)
1748 return 0;
1750 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1751 return -1;
1754 static bool
1755 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1757 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1758 return true;
1761 bool
1762 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1764 return nvptx_goacc_asyncqueue_synchronize (aq);
1767 bool
1768 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1769 struct goacc_asyncqueue *aq2)
1771 CUevent e;
1772 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1773 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1774 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1775 return true;
1778 static void
1779 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1781 if (res != CUDA_SUCCESS)
1782 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1783 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1784 cb->fn (cb->ptr);
1785 free (ptr);
1788 void
1789 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1790 void (*callback_fn)(void *),
1791 void *userptr)
1793 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1794 b->fn = callback_fn;
1795 b->ptr = userptr;
1796 b->aq = aq;
1797 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1798 cuda_callback_wrapper, (void *) b, 0);
1801 static bool
1802 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1804 CUdeviceptr pb;
1805 size_t ps;
1806 if (!s)
1807 return true;
1808 if (!d)
1810 GOMP_PLUGIN_error ("invalid device address");
1811 return false;
1813 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1814 if (!pb)
1816 GOMP_PLUGIN_error ("invalid device address");
1817 return false;
1819 if (!h)
1821 GOMP_PLUGIN_error ("invalid host address");
1822 return false;
1824 if (d == h)
1826 GOMP_PLUGIN_error ("invalid host or device address");
1827 return false;
1829 if ((void *)(d + s) > (void *)(pb + ps))
1831 GOMP_PLUGIN_error ("invalid size");
1832 return false;
1834 return true;
1837 bool
1838 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1840 if (!nvptx_attach_host_thread_to_device (ord)
1841 || !cuda_memcpy_sanity_check (src, dst, n))
1842 return false;
1843 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1844 return true;
1847 bool
1848 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1850 if (!nvptx_attach_host_thread_to_device (ord)
1851 || !cuda_memcpy_sanity_check (dst, src, n))
1852 return false;
1853 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1854 return true;
1857 bool
1858 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1860 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1861 return true;
1865 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
1866 size_t dim0_len, void *dst, size_t dst_offset1_size,
1867 size_t dst_offset0_len, size_t dst_dim1_size,
1868 const void *src, size_t src_offset1_size,
1869 size_t src_offset0_len, size_t src_dim1_size)
1871 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1872 return false;
1874 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1876 CUDA_MEMCPY2D data;
1878 memset (&data, 0, sizeof (data));
1879 data.WidthInBytes = dim1_size;
1880 data.Height = dim0_len;
1882 if (dst_ord == -1)
1884 data.dstMemoryType = CU_MEMORYTYPE_HOST;
1885 data.dstHost = dst;
1887 else
1889 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1890 data.dstDevice = (CUdeviceptr) dst;
1892 data.dstPitch = dst_dim1_size;
1893 data.dstXInBytes = dst_offset1_size;
1894 data.dstY = dst_offset0_len;
1896 if (src_ord == -1)
1898 data.srcMemoryType = CU_MEMORYTYPE_HOST;
1899 data.srcHost = src;
1901 else
1903 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1904 data.srcDevice = (CUdeviceptr) src;
1906 data.srcPitch = src_dim1_size;
1907 data.srcXInBytes = src_offset1_size;
1908 data.srcY = src_offset0_len;
1910 if (data.srcXInBytes != 0 || data.srcY != 0)
1912 /* Adjust origin to the actual array data, else the CUDA 2D memory
1913 copy API calls below may fail to validate source/dest pointers
1914 correctly (especially for Fortran where the "virtual origin" of an
1915 array is often outside the stored data). */
1916 if (src_ord == -1)
1917 data.srcHost = (const void *) ((const char *) data.srcHost
1918 + data.srcY * data.srcPitch
1919 + data.srcXInBytes);
1920 else
1921 data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
1922 data.srcXInBytes = 0;
1923 data.srcY = 0;
1926 if (data.dstXInBytes != 0 || data.dstY != 0)
1928 /* As above. */
1929 if (dst_ord == -1)
1930 data.dstHost = (void *) ((char *) data.dstHost
1931 + data.dstY * data.dstPitch
1932 + data.dstXInBytes);
1933 else
1934 data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
1935 data.dstXInBytes = 0;
1936 data.dstY = 0;
1939 CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
1940 if (res == CUDA_ERROR_INVALID_VALUE)
1941 /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
1942 for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
1943 with an error; try the slower cuMemcpy2DUnaligned now. */
1944 CUDA_CALL (cuMemcpy2DUnaligned, &data);
1945 else if (res != CUDA_SUCCESS)
1947 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
1948 return false;
1950 return true;
bool
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
                       size_t dim1_len, size_t dim0_len, void *dst,
                       size_t dst_offset2_size, size_t dst_offset1_len,
                       size_t dst_offset0_len, size_t dst_dim2_size,
                       size_t dst_dim1_len, const void *src,
                       size_t src_offset2_size, size_t src_offset1_len,
                       size_t src_offset0_len, size_t src_dim2_size,
                       size_t src_dim1_len)
{
  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
    return false;

  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */

  CUDA_MEMCPY3D data;

  memset (&data, 0, sizeof (data));
  data.WidthInBytes = dim2_size;
  data.Height = dim1_len;
  data.Depth = dim0_len;

  if (dst_ord == -1)
    {
      data.dstMemoryType = CU_MEMORYTYPE_HOST;
      data.dstHost = dst;
    }
  else
    {
      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      data.dstDevice = (CUdeviceptr) dst;
    }
  data.dstPitch = dst_dim2_size;
  data.dstHeight = dst_dim1_len;
  data.dstXInBytes = dst_offset2_size;
  data.dstY = dst_offset1_len;
  data.dstZ = dst_offset0_len;

  if (src_ord == -1)
    {
      data.srcMemoryType = CU_MEMORYTYPE_HOST;
      data.srcHost = src;
    }
  else
    {
      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      data.srcDevice = (CUdeviceptr) src;
    }
  data.srcPitch = src_dim2_size;
  data.srcHeight = src_dim1_len;
  data.srcXInBytes = src_offset2_size;
  data.srcY = src_offset1_len;
  data.srcZ = src_offset0_len;

  if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
    {
      /* Adjust origin to the actual array data, else the CUDA 3D memory
         copy API call below may fail to validate source/dest pointers
         correctly (especially for Fortran where the "virtual origin" of an
         array is often outside the stored data).  */
      if (src_ord == -1)
        data.srcHost
          = (const void *) ((const char *) data.srcHost
                            + (data.srcZ * data.srcHeight + data.srcY)
                              * data.srcPitch
                            + data.srcXInBytes);
      else
        data.srcDevice
          += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
             + data.srcXInBytes;
      data.srcXInBytes = 0;
      data.srcY = 0;
      data.srcZ = 0;
    }

  if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
    {
      /* As above.  */
      if (dst_ord == -1)
        data.dstHost = (void *) ((char *) data.dstHost
                                 + (data.dstZ * data.dstHeight + data.dstY)
                                   * data.dstPitch
                                 + data.dstXInBytes);
      else
        data.dstDevice
          += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
             + data.dstXInBytes;
      data.dstXInBytes = 0;
      data.dstY = 0;
      data.dstZ = 0;
    }

  CUDA_CALL (cuMemcpy3D, &data);
  return true;
}
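
/* Enqueue an asynchronous host-to-device copy of N bytes from SRC to DST on
   device ORD, using the CUDA stream of the OpenACC async queue AQ.  */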
bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}
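
/* Enqueue an asynchronous device-to-host copy of N bytes from SRC to DST on
   device ORD, using the CUDA stream of the OpenACC async queue AQ.  */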
bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}
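
/* Return the value of the OpenACC property PROP for device number N, or a
   zero-initialized value if the device number is out of range or the device
   has not been initialized.  */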
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
        size_t total_mem;

        CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
        propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
        size_t total_mem;
        size_t free_mem;
        CUdevice ctxdev;

        CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
        if (ptx_dev->dev == ctxdev)
          CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
        else if (ptx_dev->ctx)
          {
            CUcontext old_ctx;

            CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
          }
        else
          {
            CUcontext new_ctx;

            CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
                            ptx_dev->dev);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
          }
        propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that the number of warps does not exceed CUDA limits as well as
   GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
                            struct ptx_device *ptx_dev,
                            int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, and matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (available since
     CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
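
/* As a purely illustrative example of the estimate above (the figures are
   made up rather than queried from any real device): with
   fn->regs_per_thread == 32 and *threads_p == 8, a block needs
   32 * 32 * 8 == 8192 registers, so a device with regs_per_sm == 65536 and
   num_sms == 80 would give max_blocks == 65536 / 8192 * 80 == 640.  */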
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
                                  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}
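
/* Launch the target function TGT_FN on device ORD with argument block
   TGT_VARS.  ARGS carries launch parameters such as the requested number of
   teams and the thread limit; soft-stack storage for all launched threads is
   acquired (and cached) under ptx_dev->omp_stacks.lock before the kernel is
   started.  */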
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
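
  /* Each element of ARGS encodes an argument identifier together with either
     an immediate value (shifted by GOMP_TARGET_ARG_VALUE_SHIFT) or a flag
     saying that the value is in the following element; only device-wide
     entries are of interest here.  */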
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
        val = (intptr_t) *args++;
      else
        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
        continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
        teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
        threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  bool reverse_offload = ptx_dev->rev_data != NULL;
  struct goacc_asyncqueue *reverse_offload_aq = NULL;
  if (reverse_offload)
    {
      reverse_offload_aq
        = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
      if (!reverse_offload_aq)
        exit (EXIT_FAILURE);
    }

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
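  /* The kernel arguments are passed as one raw buffer via the
     CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE /
     CU_LAUNCH_PARAM_END triple rather than as individual kernel
     parameters.  */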
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                         32, threads, 1, lowlat_pool_size, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
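
  /* With reverse offload enabled, poll the null stream rather than blocking
     in cuCtxSynchronize, so that host-fallback regions requested by the
     device (signalled through ptx_dev->rev_data->fn) can be run while the
     kernel is still executing.  */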
  if (reverse_offload)
    while (true)
      {
        r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
        if (r == CUDA_SUCCESS)
          break;
        if (r == CUDA_ERROR_LAUNCH_FAILED)
          GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
                             maybe_abort_msg);
        else if (r != CUDA_ERROR_NOT_READY)
          GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

        if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
          {
            struct rev_offload *rev_data = ptx_dev->rev_data;
            GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
                                    rev_data->addrs, rev_data->sizes,
                                    rev_data->kinds, rev_data->dev_num,
                                    reverse_offload_aq);
            if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
              exit (EXIT_FAILURE);
            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
          }
        usleep (1);
      }
  else
    r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);

  if (reverse_offload)
    {
      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
        exit (EXIT_FAILURE);
    }
}
/* TODO: Implement GOMP_OFFLOAD_async_run.  */
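
/* (A sketch of one possible shape for that hook, given only for
   illustration; this is not code that exists in this plugin: such an
   implementation would presumably mirror GOMP_OFFLOAD_run, but enqueue the
   kernel with cuLaunchKernel on a dedicated CUstream and report completion
   back to libgomp from a cuStreamAddCallback callback instead of blocking
   the host thread in cuCtxSynchronize.)  */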