/* Plugin for NVPTX execution.

   Copyright (C) 2013-2023 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "symcat.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
#include "config/nvptx/libgomp-nvptx.h"

#include <pthread.h>
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
# include "cuda/cuda.h"
#else
# include <cuda.h>
#endif
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
70 #if CUDA_VERSION < 6000
71 extern CUresult cuGetErrorString (CUresult, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
73 #endif
75 #if CUDA_VERSION >= 6050
76 #undef cuLinkCreate
77 #undef cuLinkAddData
78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81 #else
82 typedef size_t (*CUoccupancyB2DSize)(int);
83 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84 const char *, unsigned, CUjit_option *, void **);
85 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
86 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
87 CUoccupancyB2DSize, size_t, int);
88 #endif
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
93 # include <dlfcn.h>
95 struct cuda_lib_s {
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
100 CUDA_ONE_CALL (call)
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
105 } cuda_lib;
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited = -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
113 static bool
114 init_cuda_lib (void)
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
129 return false;
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited = true;
136 return true;
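/* All CUDA entry points are subsequently reached through the table populated
   above: CUDA_CALL_PREFIX below expands to "cuda_lib.", so a call made via the
   CUDA_CALL* macros, e.g. CUDA_CALL (cuInit, 0), goes through cuda_lib.cuInit.  */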
138 # define CUDA_CALL_PREFIX cuda_lib.
139 #else
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
145 #undef CUDA_ONE_CALL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
149 #endif
151 #include "secure_getenv.h"
153 #undef MIN
154 #undef MAX
155 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
156 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* Convenience macros for the frequently used CUDA-library-call-plus-error-handling
   sequence, as well as for CUDA library calls that do the error checking
   themselves or don't do it at all.  */
162 #define CUDA_CALL_ERET(ERET, FN, ...) \
163 do { \
164 unsigned __r \
165 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
166 if (__r != CUDA_SUCCESS) \
168 GOMP_PLUGIN_error (#FN " error: %s", \
169 cuda_error (__r)); \
170 return ERET; \
172 } while (0)
174 #define CUDA_CALL(FN, ...) \
175 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
177 #define CUDA_CALL_ASSERT(FN, ...) \
178 do { \
179 unsigned __r \
180 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
181 if (__r != CUDA_SUCCESS) \
183 GOMP_PLUGIN_fatal (#FN " error: %s", \
184 cuda_error (__r)); \
186 } while (0)
188 #define CUDA_CALL_NOCHECK(FN, ...) \
189 CUDA_CALL_PREFIX FN (__VA_ARGS__)
191 #define CUDA_CALL_EXISTS(FN) \
192 CUDA_CALL_PREFIX FN
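/* For example, "CUDA_CALL (cuMemFree, ptr);" reports an error and makes the
   enclosing function return false on failure (CUDA_CALL_ERET returns the given
   ERET value instead), CUDA_CALL_ASSERT aborts via GOMP_PLUGIN_fatal, and
   CUDA_CALL_NOCHECK hands the raw CUresult back to the caller.  CUDA_CALL_EXISTS
   tests whether an entry point (e.g. cuGetErrorString below) was resolved.  */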
194 static const char *
195 cuda_error (CUresult r)
197 const char *fallback = "unknown cuda error";
198 const char *desc;
200 if (!CUDA_CALL_EXISTS (cuGetErrorString))
201 return fallback;
203 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
204 if (r == CUDA_SUCCESS)
205 return desc;
207 return fallback;
210 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
211 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
212 static char cuda_driver_version_s[30];
214 static unsigned int instantiated_devices = 0;
215 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
217 /* NVPTX/CUDA specific definition of asynchronous queues. */
218 struct goacc_asyncqueue
220 CUstream cuda_stream;
223 struct nvptx_callback
225 void (*fn) (void *);
226 void *ptr;
227 struct goacc_asyncqueue *aq;
228 struct nvptx_callback *next;
231 /* Thread-specific data for PTX. */
233 struct nvptx_thread
/* We currently have this embedded inside the plugin because libgomp manages
   devices through integer target_ids.  This might be better handled by using
   an opaque target-specific pointer directly from gomp_device_descr.  */
238 struct ptx_device *ptx_dev;
241 /* Target data function launch information. */
243 struct targ_fn_launch
245 const char *fn;
246 unsigned short dim[GOMP_DIM_MAX];
249 /* Target PTX object information. */
251 struct targ_ptx_obj
253 const char *code;
254 size_t size;
257 /* Target data image information. */
259 typedef struct nvptx_tdata
261 const struct targ_ptx_obj *ptx_objs;
262 unsigned ptx_num;
264 const char *const *var_names;
265 unsigned var_num;
267 const struct targ_fn_launch *fn_descs;
268 unsigned fn_num;
270 unsigned ind_fn_num;
271 } nvptx_tdata_t;
273 /* Descriptor of a loaded function. */
275 struct targ_fn_descriptor
277 CUfunction fn;
278 const struct targ_fn_launch *launch;
279 int regs_per_thread;
280 int max_threads_per_block;
283 /* A loaded PTX image. */
284 struct ptx_image_data
286 const void *target_data;
287 CUmodule module;
289 struct targ_fn_descriptor *fns; /* Array of functions. */
291 struct ptx_image_data *next;
294 struct ptx_free_block
296 void *ptr;
297 struct ptx_free_block *next;
300 struct ptx_device
302 CUcontext ctx;
303 bool ctx_shared;
304 CUdevice dev;
306 int ord;
307 bool overlap;
308 bool map;
309 bool concur;
310 bool mkern;
311 int mode;
312 int clock_khz;
313 int num_sms;
314 int regs_per_block;
315 int regs_per_sm;
316 int warp_size;
317 int max_threads_per_block;
318 int max_threads_per_multiprocessor;
319 int default_dims[GOMP_DIM_MAX];
321 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
322 char name[256];
324 struct ptx_image_data *images; /* Images loaded on device. */
325 pthread_mutex_t image_lock; /* Lock for above list. */
327 struct ptx_free_block *free_blocks;
328 pthread_mutex_t free_blocks_lock;
330 /* OpenMP stacks, cached between kernel invocations. */
331 struct
333 CUdeviceptr ptr;
334 size_t size;
335 pthread_mutex_t lock;
336 } omp_stacks;
338 struct rev_offload *rev_data;
339 struct ptx_device *next;
342 static struct ptx_device **ptx_devices;
344 static inline struct nvptx_thread *
345 nvptx_thread (void)
347 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
350 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
351 should be locked on entry and remains locked on exit. */
353 static bool
354 nvptx_init (void)
356 int ndevs;
358 if (instantiated_devices != 0)
359 return true;
361 if (!init_cuda_lib ())
362 return false;
364 CUDA_CALL (cuInit, 0);
366 int cuda_driver_version;
367 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
368 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
369 "CUDA Driver %u.%u",
370 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
372 CUDA_CALL (cuDeviceGetCount, &ndevs);
373 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
374 * ndevs);
376 return true;
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */
382 static bool
383 nvptx_attach_host_thread_to_device (int n)
385 CUdevice dev;
386 CUresult r;
387 struct ptx_device *ptx_dev;
388 CUcontext thd_ctx;
390 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
391 if (r == CUDA_ERROR_NOT_PERMITTED)
393 /* Assume we're in a CUDA callback, just return true. */
394 return true;
396 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
398 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
399 return false;
402 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
403 return true;
404 else
406 CUcontext old_ctx;
408 ptx_dev = ptx_devices[n];
409 if (!ptx_dev)
411 GOMP_PLUGIN_error ("device %d not found", n);
412 return false;
415 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
/* We don't necessarily have a current context (e.g. if it has been
   destroyed).  Pop it if we do though.  */
419 if (thd_ctx != NULL)
420 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
422 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
424 return true;
427 static struct ptx_device *
428 nvptx_open_device (int n)
430 struct ptx_device *ptx_dev;
431 CUdevice dev, ctx_dev;
432 CUresult r;
433 int pi;
435 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
437 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
439 ptx_dev->ord = n;
440 ptx_dev->dev = dev;
441 ptx_dev->ctx_shared = false;
443 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
444 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
446 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
447 return NULL;
450 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
452 /* The current host thread has an active context for a different device.
453 Detach it. */
454 CUcontext old_ctx;
455 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
458 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
460 if (!ptx_dev->ctx)
461 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
462 else
463 ptx_dev->ctx_shared = true;
465 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
466 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
467 ptx_dev->overlap = pi;
469 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
470 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
471 ptx_dev->map = pi;
473 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
474 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
475 ptx_dev->concur = pi;
477 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
478 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
479 ptx_dev->mode = pi;
481 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
482 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
483 ptx_dev->mkern = pi;
485 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
486 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
487 ptx_dev->clock_khz = pi;
489 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
490 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
491 ptx_dev->num_sms = pi;
493 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
494 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
495 ptx_dev->regs_per_block = pi;
497 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
498 in CUDA 6.0 and newer. */
499 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
500 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
501 dev);
502 /* Fallback: use limit of registers per block, which is usually equal. */
503 if (r == CUDA_ERROR_INVALID_VALUE)
504 pi = ptx_dev->regs_per_block;
505 else if (r != CUDA_SUCCESS)
507 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
508 return NULL;
510 ptx_dev->regs_per_sm = pi;
512 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
513 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
514 if (pi != 32)
516 GOMP_PLUGIN_error ("Only warp size 32 is supported");
517 return NULL;
519 ptx_dev->warp_size = pi;
521 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
522 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
523 ptx_dev->max_threads_per_block = pi;
525 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
526 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
527 ptx_dev->max_threads_per_multiprocessor = pi;
/* Required below for reverse offload as implemented, but with compute
   capability >= 2.0 and 64-bit device processes, this should universally be
   the case; hence, an assert.  */
532 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
533 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
534 assert (r == CUDA_SUCCESS && pi);
536 for (int i = 0; i != GOMP_DIM_MAX; i++)
537 ptx_dev->default_dims[i] = 0;
539 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
540 dev);
542 ptx_dev->images = NULL;
543 pthread_mutex_init (&ptx_dev->image_lock, NULL);
545 ptx_dev->free_blocks = NULL;
546 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
548 ptx_dev->omp_stacks.ptr = 0;
549 ptx_dev->omp_stacks.size = 0;
550 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
552 ptx_dev->rev_data = NULL;
554 return ptx_dev;
557 static bool
558 nvptx_close_device (struct ptx_device *ptx_dev)
560 if (!ptx_dev)
561 return true;
563 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
565 struct ptx_free_block *b_next = b->next;
566 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
567 free (b);
568 b = b_next;
571 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
572 pthread_mutex_destroy (&ptx_dev->image_lock);
574 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
576 if (ptx_dev->omp_stacks.ptr)
577 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
579 if (!ptx_dev->ctx_shared)
580 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
582 free (ptx_dev);
583 return true;
586 static int
587 nvptx_get_num_devices (void)
589 int n;
591 /* This function will be called before the plugin has been initialized in
592 order to enumerate available devices, but CUDA API routines can't be used
593 until cuInit has been called. Just call it now (but don't yet do any
594 further initialization). */
595 if (instantiated_devices == 0)
597 if (!init_cuda_lib ())
598 return 0;
599 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
600 /* This is not an error: e.g. we may have CUDA libraries installed but
601 no devices available. */
602 if (r != CUDA_SUCCESS)
604 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
605 cuda_error (r));
606 return 0;
610 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
611 return n;
614 static void
615 notify_var (const char *var_name, const char *env_var)
617 if (env_var == NULL)
618 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
619 else
620 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
623 static void
624 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
626 const char *var_name = "GOMP_NVPTX_JIT";
627 const char *env_var = secure_getenv (var_name);
628 notify_var (var_name, env_var);
630 if (env_var == NULL)
631 return;
633 const char *c = env_var;
634 while (*c != '\0')
636 while (*c == ' ')
637 c++;
639 if (c[0] == '-' && c[1] == 'O'
640 && '0' <= c[2] && c[2] <= '4'
641 && (c[3] == '\0' || c[3] == ' '))
643 *gomp_nvptx_o = c[2] - '0';
644 c += 3;
645 continue;
648 GOMP_PLUGIN_error ("Error parsing %s", var_name);
649 break;
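/* For example, running with GOMP_NVPTX_JIT=-O2 in the environment makes
   link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL 2 to the PTX JIT; anything
   other than -O0 ... -O4 is reported as a parse error.  */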
653 static bool
654 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
655 unsigned num_objs)
657 CUjit_option opts[7];
658 void *optvals[7];
659 float elapsed = 0.0;
660 char elog[1024];
661 char ilog[16384];
662 CUlinkState linkstate;
663 CUresult r;
664 void *linkout;
665 size_t linkoutsize __attribute__ ((unused));
667 opts[0] = CU_JIT_WALL_TIME;
668 optvals[0] = &elapsed;
670 opts[1] = CU_JIT_INFO_LOG_BUFFER;
671 optvals[1] = &ilog[0];
673 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
674 optvals[2] = (void *) sizeof ilog;
676 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
677 optvals[3] = &elog[0];
679 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
680 optvals[4] = (void *) sizeof elog;
682 opts[5] = CU_JIT_LOG_VERBOSE;
683 optvals[5] = (void *) 1;
685 static intptr_t gomp_nvptx_o = -1;
687 static bool init_done = false;
688 if (!init_done)
690 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
691 init_done = true;
694 int nopts = 6;
695 if (gomp_nvptx_o != -1)
697 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
698 optvals[nopts] = (void *) gomp_nvptx_o;
699 nopts++;
702 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
703 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
704 else
705 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
707 for (; num_objs--; ptx_objs++)
709 /* cuLinkAddData's 'data' argument erroneously omits the const
710 qualifier. */
711 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
712 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
713 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
714 (char *) ptx_objs->code, ptx_objs->size,
715 0, 0, 0, 0);
716 else
717 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
718 (char *) ptx_objs->code, ptx_objs->size,
719 0, 0, 0, 0);
720 if (r != CUDA_SUCCESS)
722 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
723 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
724 cuda_error (r));
725 return false;
729 GOMP_PLUGIN_debug (0, "Linking\n");
730 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
732 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
733 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
735 if (r != CUDA_SUCCESS)
737 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
738 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
739 return false;
742 CUDA_CALL (cuModuleLoadData, module, linkout);
743 CUDA_CALL (cuLinkDestroy, linkstate);
744 return true;
747 static void
748 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
749 CUdeviceptr dp, CUstream stream)
751 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
752 CUfunction function;
753 int i;
754 void *kargs[1];
755 struct nvptx_thread *nvthd = nvptx_thread ();
756 int warp_size = nvthd->ptx_dev->warp_size;
758 function = targ_fn->fn;
760 /* Initialize the launch dimensions. Typically this is constant,
761 provided by the device compiler, but we must permit runtime
762 values. */
763 int seen_zero = 0;
764 for (i = 0; i != GOMP_DIM_MAX; i++)
766 if (targ_fn->launch->dim[i])
767 dims[i] = targ_fn->launch->dim[i];
768 if (!dims[i])
769 seen_zero = 1;
772 if (seen_zero)
774 pthread_mutex_lock (&ptx_dev_lock);
776 static int gomp_openacc_dims[GOMP_DIM_MAX];
777 if (!gomp_openacc_dims[0])
/* See if the user provided the GOMP_OPENACC_DIM environment
   variable to specify runtime defaults.  */
781 for (int i = 0; i < GOMP_DIM_MAX; ++i)
782 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
785 if (!nvthd->ptx_dev->default_dims[0])
787 int default_dims[GOMP_DIM_MAX];
788 for (int i = 0; i < GOMP_DIM_MAX; ++i)
789 default_dims[i] = gomp_openacc_dims[i];
791 int gang, worker, vector;
793 int block_size = nvthd->ptx_dev->max_threads_per_block;
794 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
795 int dev_size = nvthd->ptx_dev->num_sms;
796 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
797 " dev_size=%d, cpu_size=%d\n",
798 warp_size, block_size, dev_size, cpu_size);
800 gang = (cpu_size / block_size) * dev_size;
801 worker = block_size / warp_size;
802 vector = warp_size;
805 /* There is no upper bound on the gang size. The best size
806 matches the hardware configuration. Logical gangs are
807 scheduled onto physical hardware. To maximize usage, we
808 should guess a large number. */
809 if (default_dims[GOMP_DIM_GANG] < 1)
810 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
811 /* The worker size must not exceed the hardware. */
812 if (default_dims[GOMP_DIM_WORKER] < 1
813 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
814 default_dims[GOMP_DIM_WORKER] = worker;
815 /* The vector size must exactly match the hardware. */
816 if (default_dims[GOMP_DIM_VECTOR] < 1
817 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
818 default_dims[GOMP_DIM_VECTOR] = vector;
820 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
821 default_dims[GOMP_DIM_GANG],
822 default_dims[GOMP_DIM_WORKER],
823 default_dims[GOMP_DIM_VECTOR]);
825 for (i = 0; i != GOMP_DIM_MAX; i++)
826 nvthd->ptx_dev->default_dims[i] = default_dims[i];
828 pthread_mutex_unlock (&ptx_dev_lock);
831 bool default_dim_p[GOMP_DIM_MAX];
832 for (i = 0; i != GOMP_DIM_MAX; i++)
833 default_dim_p[i] = !dims[i];
835 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
837 for (i = 0; i != GOMP_DIM_MAX; i++)
838 if (default_dim_p[i])
839 dims[i] = nvthd->ptx_dev->default_dims[i];
841 if (default_dim_p[GOMP_DIM_VECTOR])
842 dims[GOMP_DIM_VECTOR]
843 = MIN (dims[GOMP_DIM_VECTOR],
844 (targ_fn->max_threads_per_block / warp_size
845 * warp_size));
847 if (default_dim_p[GOMP_DIM_WORKER])
848 dims[GOMP_DIM_WORKER]
849 = MIN (dims[GOMP_DIM_WORKER],
850 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
852 else
854 /* Handle the case that the compiler allows the runtime to choose
855 the vector-length conservatively, by ignoring
856 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
857 it. */
858 int vectors = 0;
/* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
   gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
   exceed targ_fn->max_threads_per_block.  */
862 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
863 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
864 int grids, blocks;
866 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
867 &blocks, function, NULL, 0,
868 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
869 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
870 "grid = %d, block = %d\n", grids, blocks);
/* Keep the num_gangs proportional to the block size.  In the case where a
   block size is limited by shared-memory or register-file capacity, the
   runtime will not excessively over-assign gangs to the multiprocessor units
   if their state is going to be swapped out even more than necessary.  The
   constant factor 2 is there to prevent threads from idling when there is
   insufficient work for them.  */
880 if (gangs == 0)
881 gangs = 2 * grids * (blocks / warp_size);
883 if (vectors == 0)
884 vectors = warp_size;
886 if (workers == 0)
888 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
889 ? vectors
890 : dims[GOMP_DIM_VECTOR]);
891 workers = blocks / actual_vectors;
892 workers = MAX (workers, 1);
893 /* If we need a per-worker barrier ... . */
894 if (actual_vectors > 32)
895 /* Don't use more barriers than available. */
896 workers = MIN (workers, 15);
899 for (i = 0; i != GOMP_DIM_MAX; i++)
900 if (default_dim_p[i])
901 switch (i)
903 case GOMP_DIM_GANG: dims[i] = gangs; break;
904 case GOMP_DIM_WORKER: dims[i] = workers; break;
905 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
906 default: GOMP_PLUGIN_fatal ("invalid dim");
912 /* Check if the accelerator has sufficient hardware resources to
913 launch the offloaded kernel. */
914 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
915 > targ_fn->max_threads_per_block)
917 const char *msg
918 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
919 " with num_workers = %d and vector_length = %d"
920 "; "
921 "recompile the program with 'num_workers = x and vector_length = y'"
922 " on that offloaded region or '-fopenacc-dim=:x:y' where"
923 " x * y <= %d"
924 ".\n");
925 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
926 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
929 /* Check if the accelerator has sufficient barrier resources to
930 launch the offloaded kernel. */
931 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
933 const char *msg
934 = ("The Nvidia accelerator has insufficient barrier resources to launch"
935 " '%s' with num_workers = %d and vector_length = %d"
936 "; "
937 "recompile the program with 'num_workers = x' on that offloaded"
938 " region or '-fopenacc-dim=:x:' where x <= 15"
939 "; "
940 "or, recompile the program with 'vector_length = 32' on that"
941 " offloaded region or '-fopenacc-dim=::32'"
942 ".\n");
943 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
944 dims[GOMP_DIM_VECTOR]);
947 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
948 " gangs=%u, workers=%u, vectors=%u\n",
949 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
950 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
952 // OpenACC CUDA
954 // num_gangs nctaid.x
955 // num_workers ntid.y
956 // vector length ntid.x
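// Accordingly, cuLaunchKernel below is invoked with gridDimX = num_gangs,
// blockDimX = vector length and blockDimY = num_workers.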
958 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
959 acc_prof_info *prof_info = thr->prof_info;
960 acc_event_info enqueue_launch_event_info;
961 acc_api_info *api_info = thr->api_info;
962 bool profiling_p = __builtin_expect (prof_info != NULL, false);
963 if (profiling_p)
965 prof_info->event_type = acc_ev_enqueue_launch_start;
967 enqueue_launch_event_info.launch_event.event_type
968 = prof_info->event_type;
969 enqueue_launch_event_info.launch_event.valid_bytes
970 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
971 enqueue_launch_event_info.launch_event.parent_construct
972 = acc_construct_parallel;
973 enqueue_launch_event_info.launch_event.implicit = 1;
974 enqueue_launch_event_info.launch_event.tool_info = NULL;
975 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
976 enqueue_launch_event_info.launch_event.num_gangs
977 = dims[GOMP_DIM_GANG];
978 enqueue_launch_event_info.launch_event.num_workers
979 = dims[GOMP_DIM_WORKER];
980 enqueue_launch_event_info.launch_event.vector_length
981 = dims[GOMP_DIM_VECTOR];
983 api_info->device_api = acc_device_api_cuda;
985 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
986 api_info);
989 kargs[0] = &dp;
990 CUDA_CALL_ASSERT (cuLaunchKernel, function,
991 dims[GOMP_DIM_GANG], 1, 1,
992 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
993 0, stream, kargs, 0);
995 if (profiling_p)
997 prof_info->event_type = acc_ev_enqueue_launch_end;
998 enqueue_launch_event_info.launch_event.event_type
999 = prof_info->event_type;
1000 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1001 api_info);
1004 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1005 targ_fn->launch->fn);
1008 void * openacc_get_current_cuda_context (void);
1010 static void
1011 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1013 acc_prof_info *prof_info = thr->prof_info;
1014 acc_event_info data_event_info;
1015 acc_api_info *api_info = thr->api_info;
1017 prof_info->event_type = acc_ev_alloc;
1019 data_event_info.data_event.event_type = prof_info->event_type;
1020 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1021 data_event_info.data_event.parent_construct = acc_construct_parallel;
1022 data_event_info.data_event.implicit = 1;
1023 data_event_info.data_event.tool_info = NULL;
1024 data_event_info.data_event.var_name = NULL;
1025 data_event_info.data_event.bytes = s;
1026 data_event_info.data_event.host_ptr = NULL;
1027 data_event_info.data_event.device_ptr = dp;
1029 api_info->device_api = acc_device_api_cuda;
1031 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1034 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1035 size threshold, or if FORCE is true. */
1037 static void
1038 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1040 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1041 if (ptx_dev->omp_stacks.ptr
1042 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1044 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1045 if (r != CUDA_SUCCESS)
1046 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1047 ptx_dev->omp_stacks.ptr = 0;
1048 ptx_dev->omp_stacks.size = 0;
1050 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1053 static void *
1054 nvptx_alloc (size_t s, bool suppress_errors)
1056 CUdeviceptr d;
1058 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1059 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1060 return NULL;
1061 else if (r != CUDA_SUCCESS)
1063 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1064 return NULL;
1067 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1068 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1069 bool profiling_p
1070 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1071 if (profiling_p)
1072 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1074 return (void *) d;
1077 static void
1078 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1080 acc_prof_info *prof_info = thr->prof_info;
1081 acc_event_info data_event_info;
1082 acc_api_info *api_info = thr->api_info;
1084 prof_info->event_type = acc_ev_free;
1086 data_event_info.data_event.event_type = prof_info->event_type;
1087 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1088 data_event_info.data_event.parent_construct = acc_construct_parallel;
1089 data_event_info.data_event.implicit = 1;
1090 data_event_info.data_event.tool_info = NULL;
1091 data_event_info.data_event.var_name = NULL;
1092 data_event_info.data_event.bytes = -1;
1093 data_event_info.data_event.host_ptr = NULL;
1094 data_event_info.data_event.device_ptr = p;
1096 api_info->device_api = acc_device_api_cuda;
1098 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1101 static bool
1102 nvptx_free (void *p, struct ptx_device *ptx_dev)
1104 CUdeviceptr pb;
1105 size_t ps;
1107 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1108 (CUdeviceptr) p);
1109 if (r == CUDA_ERROR_NOT_PERMITTED)
/* We assume that this error indicates we are in a CUDA callback context,
   where CUDA calls are not allowed (see the cuStreamAddCallback documentation
   for a description).  Arrange to free this piece of device memory later.  */
1115 struct ptx_free_block *n
1116 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1117 n->ptr = p;
1118 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1119 n->next = ptx_dev->free_blocks;
1120 ptx_dev->free_blocks = n;
1121 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1122 return true;
1124 else if (r != CUDA_SUCCESS)
1126 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1127 return false;
1129 if ((CUdeviceptr) p != pb)
1131 GOMP_PLUGIN_error ("invalid device address");
1132 return false;
1135 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1136 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1137 bool profiling_p
1138 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1139 if (profiling_p)
1140 goacc_profiling_acc_ev_free (thr, p);
1142 return true;
1145 static void *
1146 nvptx_get_current_cuda_device (void)
1148 struct nvptx_thread *nvthd = nvptx_thread ();
1150 if (!nvthd || !nvthd->ptx_dev)
1151 return NULL;
1153 return &nvthd->ptx_dev->dev;
1156 static void *
1157 nvptx_get_current_cuda_context (void)
1159 struct nvptx_thread *nvthd = nvptx_thread ();
1161 if (!nvthd || !nvthd->ptx_dev)
1162 return NULL;
1164 return nvthd->ptx_dev->ctx;
1167 /* Plugin entry points. */
1169 const char *
1170 GOMP_OFFLOAD_get_name (void)
1172 return "nvptx";
1175 unsigned int
1176 GOMP_OFFLOAD_get_caps (void)
1178 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
int
GOMP_OFFLOAD_get_type (void)
1184 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1190 int num_devices = nvptx_get_num_devices ();
/* Return -1 if the omp_requires_mask cannot be fulfilled but devices were
   present.  Unified-shared address: see the comment in nvptx_open_device
   for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
1194 if (num_devices > 0
1195 && ((omp_requires_mask
1196 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1197 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1198 return -1;
1199 return num_devices;
1202 bool
1203 GOMP_OFFLOAD_init_device (int n)
1205 struct ptx_device *dev;
1207 pthread_mutex_lock (&ptx_dev_lock);
1209 if (!nvptx_init () || ptx_devices[n] != NULL)
1211 pthread_mutex_unlock (&ptx_dev_lock);
1212 return false;
1215 dev = nvptx_open_device (n);
1216 if (dev)
1218 ptx_devices[n] = dev;
1219 instantiated_devices++;
1222 pthread_mutex_unlock (&ptx_dev_lock);
1224 return dev != NULL;
1227 bool
1228 GOMP_OFFLOAD_fini_device (int n)
1230 pthread_mutex_lock (&ptx_dev_lock);
1232 if (ptx_devices[n] != NULL)
1234 if (!nvptx_attach_host_thread_to_device (n)
1235 || !nvptx_close_device (ptx_devices[n]))
1237 pthread_mutex_unlock (&ptx_dev_lock);
1238 return false;
1240 ptx_devices[n] = NULL;
1241 instantiated_devices--;
1244 if (instantiated_devices == 0)
1246 free (ptx_devices);
1247 ptx_devices = NULL;
1250 pthread_mutex_unlock (&ptx_dev_lock);
1251 return true;
1254 /* Return the libgomp version number we're compatible with. There is
1255 no requirement for cross-version compatibility. */
1257 unsigned
1258 GOMP_OFFLOAD_version (void)
1260 return GOMP_VERSION;
1263 /* Initialize __nvptx_clocktick, if present in MODULE. */
1265 static void
1266 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1268 CUdeviceptr dptr;
1269 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1270 module, "__nvptx_clocktick");
1271 if (r == CUDA_ERROR_NOT_FOUND)
1272 return;
1273 if (r != CUDA_SUCCESS)
1274 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1275 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1276 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1277 sizeof (__nvptx_clocktick));
1278 if (r != CUDA_SUCCESS)
1279 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1282 /* Load the (partial) program described by TARGET_DATA to device
1283 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1284 will contain the on-device addresses of the functions for reverse offload.
1285 To be freed by the caller. */
int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1289 struct addr_pair **target_table,
1290 uint64_t **rev_fn_table,
1291 uint64_t *host_ind_fn_table)
1293 CUmodule module;
1294 const char *const *var_names;
1295 const struct targ_fn_launch *fn_descs;
1296 unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
1297 struct targ_fn_descriptor *targ_fns;
1298 struct addr_pair *targ_tbl;
1299 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1300 struct ptx_image_data *new_image;
1301 struct ptx_device *dev;
1303 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1305 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1306 " (expected %u, received %u)",
1307 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1308 return -1;
1311 if (!nvptx_attach_host_thread_to_device (ord)
1312 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1313 return -1;
1315 dev = ptx_devices[ord];
/* The mkoffload utility emits a struct of pointers/integers at the
   start of each offload image.  The array of kernel names and the
   function addresses form a one-to-one correspondence.  */
1321 var_entries = img_header->var_num;
1322 var_names = img_header->var_names;
1323 fn_entries = img_header->fn_num;
1324 fn_descs = img_header->fn_descs;
1325 ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1326 ? img_header->ind_fn_num : 0;
1328 /* Currently, other_entries contains only the struct of ICVs. */
1329 other_entries = 1;
1331 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1332 * (fn_entries + var_entries + other_entries));
1333 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1334 * fn_entries);
1336 *target_table = targ_tbl;
1338 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1339 new_image->target_data = target_data;
1340 new_image->module = module;
1341 new_image->fns = targ_fns;
1343 pthread_mutex_lock (&dev->image_lock);
1344 new_image->next = dev->images;
1345 dev->images = new_image;
1346 pthread_mutex_unlock (&dev->image_lock);
1348 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1350 CUfunction function;
1351 int nregs, mthrs;
1353 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1354 fn_descs[i].fn);
1355 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1356 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1357 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1358 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1360 targ_fns->fn = function;
1361 targ_fns->launch = &fn_descs[i];
1362 targ_fns->regs_per_thread = nregs;
1363 targ_fns->max_threads_per_block = mthrs;
1365 targ_tbl->start = (uintptr_t) targ_fns;
1366 targ_tbl->end = targ_tbl->start + 1;
1369 for (j = 0; j < var_entries; j++, targ_tbl++)
1371 CUdeviceptr var;
1372 size_t bytes;
1374 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1375 &var, &bytes, module, var_names[j]);
1377 targ_tbl->start = (uintptr_t) var;
1378 targ_tbl->end = targ_tbl->start + bytes;
1381 if (ind_fn_entries > 0)
1383 CUdeviceptr var;
1384 size_t bytes;
1386 /* Read indirect function table from image. */
1387 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1388 "$offload_ind_func_table");
1389 if (r != CUDA_SUCCESS)
1390 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1391 assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1393 uint64_t ind_fn_table[ind_fn_entries];
1394 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1395 if (r != CUDA_SUCCESS)
1396 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1398 /* Build host->target address map for indirect functions. */
1399 uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1400 for (unsigned k = 0; k < ind_fn_entries; k++)
1402 ind_fn_map[k * 2] = host_ind_fn_table[k];
1403 ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1404 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1405 k, host_ind_fn_table[k], ind_fn_table[k]);
1407 ind_fn_map[ind_fn_entries * 2] = 0;
1409 /* Write the map onto the target. */
1410 void *map_target_addr
1411 = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1412 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1414 GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1415 (void*) ind_fn_map,
1416 sizeof (ind_fn_map));
1418 /* Write address of the map onto the target. */
1419 CUdeviceptr varptr;
1420 size_t varsize;
1421 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1422 module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1423 if (r != CUDA_SUCCESS)
1424 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1425 cuda_error (r));
1427 GOMP_PLUGIN_debug (0,
1428 "Indirect map variable found at %llx with size %ld\n",
1429 varptr, varsize);
1431 GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1432 sizeof (map_target_addr));
1435 CUdeviceptr varptr;
1436 size_t varsize;
1437 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1438 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1440 if (r == CUDA_SUCCESS)
1442 targ_tbl->start = (uintptr_t) varptr;
1443 targ_tbl->end = (uintptr_t) (varptr + varsize);
1445 else
1446 /* The variable was not in this image. */
1447 targ_tbl->start = targ_tbl->end = 0;
1449 if (rev_fn_table && fn_entries == 0)
1450 *rev_fn_table = NULL;
1451 else if (rev_fn_table)
1453 CUdeviceptr var;
1454 size_t bytes;
1455 unsigned int i;
1456 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1457 "$offload_func_table");
1458 if (r != CUDA_SUCCESS)
1459 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1460 assert (bytes == sizeof (uint64_t) * fn_entries);
1461 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1462 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1463 if (r != CUDA_SUCCESS)
1464 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
/* Free the table if it contains only NULL entries.  */
1466 for (i = 0; i < fn_entries; ++i)
1467 if ((*rev_fn_table)[i] != 0)
1468 break;
1469 if (i == fn_entries)
1471 free (*rev_fn_table);
1472 *rev_fn_table = NULL;
1476 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
/* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
   available, but it might not be.  One reason could be: if the user code
   has 'omp target device(ancestor:1)' in pure host code, GOMP_target_ext
   is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
   are not linked in.  */
1483 CUdeviceptr device_rev_offload_var;
1484 size_t device_rev_offload_size;
1485 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1486 &device_rev_offload_var,
1487 &device_rev_offload_size, module,
1488 XSTRING (GOMP_REV_OFFLOAD_VAR));
1489 if (r != CUDA_SUCCESS)
1491 free (*rev_fn_table);
1492 *rev_fn_table = NULL;
1494 else
/* cuMemHostAlloc memory is accessible on the device, if unified-shared
   address is supported; this is assumed (see the comment in
   nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).  */
1499 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1500 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1501 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1502 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1503 sizeof (dp));
1504 if (r != CUDA_SUCCESS)
1505 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1509 nvptx_set_clocktick (module, dev);
1511 return fn_entries + var_entries + other_entries;
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by GOMP_OFFLOAD_load_image.  */
1517 bool
1518 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1520 struct ptx_image_data *image, **prev_p;
1521 struct ptx_device *dev = ptx_devices[ord];
1523 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1525 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1526 " (expected %u, received %u)",
1527 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1528 return false;
1531 bool ret = true;
1532 pthread_mutex_lock (&dev->image_lock);
1533 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1534 if (image->target_data == target_data)
1536 *prev_p = image->next;
1537 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1538 ret = false;
1539 free (image->fns);
1540 free (image);
1541 break;
1543 pthread_mutex_unlock (&dev->image_lock);
1544 return ret;
1547 void *
1548 GOMP_OFFLOAD_alloc (int ord, size_t size)
1550 if (!nvptx_attach_host_thread_to_device (ord))
1551 return NULL;
1553 struct ptx_device *ptx_dev = ptx_devices[ord];
1554 struct ptx_free_block *blocks, *tmp;
1556 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1557 blocks = ptx_dev->free_blocks;
1558 ptx_dev->free_blocks = NULL;
1559 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1561 nvptx_stacks_free (ptx_dev, false);
1563 while (blocks)
1565 tmp = blocks->next;
1566 nvptx_free (blocks->ptr, ptx_dev);
1567 free (blocks);
1568 blocks = tmp;
1571 void *d = nvptx_alloc (size, true);
1572 if (d)
1573 return d;
1574 else
1576 /* Memory allocation failed. Try freeing the stacks block, and
1577 retrying. */
1578 nvptx_stacks_free (ptx_dev, true);
1579 return nvptx_alloc (size, false);
1583 bool
1584 GOMP_OFFLOAD_free (int ord, void *ptr)
1586 return (nvptx_attach_host_thread_to_device (ord)
1587 && nvptx_free (ptr, ptx_devices[ord]));
1590 void
1591 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1592 size_t mapnum __attribute__((unused)),
1593 void **hostaddrs __attribute__((unused)),
1594 void **devaddrs,
1595 unsigned *dims, void *targ_mem_desc)
1597 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1599 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1600 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
1602 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1603 const char *maybe_abort_msg = "(perhaps abort was called)";
1604 if (r == CUDA_ERROR_LAUNCH_FAILED)
1605 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1606 maybe_abort_msg);
1607 else if (r != CUDA_SUCCESS)
1608 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1611 void
1612 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1613 size_t mapnum __attribute__((unused)),
1614 void **hostaddrs __attribute__((unused)),
1615 void **devaddrs,
1616 unsigned *dims, void *targ_mem_desc,
1617 struct goacc_asyncqueue *aq)
1619 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1621 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1622 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
1625 void *
1626 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1628 struct ptx_device *ptx_dev;
1629 struct nvptx_thread *nvthd
1630 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1631 CUcontext thd_ctx;
1633 ptx_dev = ptx_devices[ord];
1635 assert (ptx_dev);
1637 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1639 assert (ptx_dev->ctx);
1641 if (!thd_ctx)
1642 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1644 nvthd->ptx_dev = ptx_dev;
1646 return (void *) nvthd;
1649 void
1650 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1652 free (data);
1655 void *
1656 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1658 return nvptx_get_current_cuda_device ();
1661 void *
1662 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1664 return nvptx_get_current_cuda_context ();
1667 /* This returns a CUstream. */
1668 void *
1669 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1671 return (void *) aq->cuda_stream;
1674 /* This takes a CUstream. */
int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1678 if (aq->cuda_stream)
1680 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1681 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1684 aq->cuda_stream = (CUstream) stream;
1685 return 1;
1688 static struct goacc_asyncqueue *
1689 nvptx_goacc_asyncqueue_construct (unsigned int flags)
1691 CUstream stream = NULL;
1692 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1694 struct goacc_asyncqueue *aq
1695 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1696 aq->cuda_stream = stream;
1697 return aq;
1700 struct goacc_asyncqueue *
1701 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1703 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1706 static bool
1707 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1709 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1710 free (aq);
1711 return true;
1714 bool
1715 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1717 return nvptx_goacc_asyncqueue_destruct (aq);
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1723 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1724 if (r == CUDA_SUCCESS)
1725 return 1;
1726 if (r == CUDA_ERROR_NOT_READY)
1727 return 0;
1729 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1730 return -1;
1733 static bool
1734 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1736 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1737 return true;
1740 bool
1741 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1743 return nvptx_goacc_asyncqueue_synchronize (aq);
1746 bool
1747 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1748 struct goacc_asyncqueue *aq2)
1750 CUevent e;
1751 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1752 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1753 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1754 return true;
1757 static void
1758 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1760 if (res != CUDA_SUCCESS)
1761 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1762 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1763 cb->fn (cb->ptr);
1764 free (ptr);
1767 void
1768 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1769 void (*callback_fn)(void *),
1770 void *userptr)
1772 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1773 b->fn = callback_fn;
1774 b->ptr = userptr;
1775 b->aq = aq;
1776 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1777 cuda_callback_wrapper, (void *) b, 0);
1780 static bool
1781 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1783 CUdeviceptr pb;
1784 size_t ps;
1785 if (!s)
1786 return true;
1787 if (!d)
1789 GOMP_PLUGIN_error ("invalid device address");
1790 return false;
1792 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1793 if (!pb)
1795 GOMP_PLUGIN_error ("invalid device address");
1796 return false;
1798 if (!h)
1800 GOMP_PLUGIN_error ("invalid host address");
1801 return false;
1803 if (d == h)
1805 GOMP_PLUGIN_error ("invalid host or device address");
1806 return false;
1808 if ((void *)(d + s) > (void *)(pb + ps))
1810 GOMP_PLUGIN_error ("invalid size");
1811 return false;
1813 return true;
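/* Note the argument order: callers pass the host pointer as H and the device
   pointer as D, e.g. GOMP_OFFLOAD_host2dev below checks (src, dst, n) while
   GOMP_OFFLOAD_dev2host checks (dst, src, n).  */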
1816 bool
1817 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1819 if (!nvptx_attach_host_thread_to_device (ord)
1820 || !cuda_memcpy_sanity_check (src, dst, n))
1821 return false;
1822 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1823 return true;
1826 bool
1827 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1829 if (!nvptx_attach_host_thread_to_device (ord)
1830 || !cuda_memcpy_sanity_check (dst, src, n))
1831 return false;
1832 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1833 return true;
1836 bool
1837 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1839 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1840 return true;
bool
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
1845 size_t dim0_len, void *dst, size_t dst_offset1_size,
1846 size_t dst_offset0_len, size_t dst_dim1_size,
1847 const void *src, size_t src_offset1_size,
1848 size_t src_offset0_len, size_t src_dim1_size)
1850 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1851 return false;
1853 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1855 CUDA_MEMCPY2D data;
1857 memset (&data, 0, sizeof (data));
1858 data.WidthInBytes = dim1_size;
1859 data.Height = dim0_len;
1861 if (dst_ord == -1)
1863 data.dstMemoryType = CU_MEMORYTYPE_HOST;
1864 data.dstHost = dst;
1866 else
1868 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1869 data.dstDevice = (CUdeviceptr) dst;
1871 data.dstPitch = dst_dim1_size;
1872 data.dstXInBytes = dst_offset1_size;
1873 data.dstY = dst_offset0_len;
1875 if (src_ord == -1)
1877 data.srcMemoryType = CU_MEMORYTYPE_HOST;
1878 data.srcHost = src;
1880 else
1882 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1883 data.srcDevice = (CUdeviceptr) src;
1885 data.srcPitch = src_dim1_size;
1886 data.srcXInBytes = src_offset1_size;
1887 data.srcY = src_offset0_len;
1889 CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
1890 if (res == CUDA_ERROR_INVALID_VALUE)
/* If the pitch exceeds CU_DEVICE_ATTRIBUTE_MAX_PITCH, or for
   device-to-device copies of (some) memory not allocated by
   cuMemAllocPitch, cuMemcpy2D fails with an error; try the slower
   cuMemcpy2DUnaligned now.  */
1894 CUDA_CALL (cuMemcpy2DUnaligned, &data);
1895 else if (res != CUDA_SUCCESS)
1897 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
1898 return false;
1900 return true;
bool
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
1905 size_t dim1_len, size_t dim0_len, void *dst,
1906 size_t dst_offset2_size, size_t dst_offset1_len,
1907 size_t dst_offset0_len, size_t dst_dim2_size,
1908 size_t dst_dim1_len, const void *src,
1909 size_t src_offset2_size, size_t src_offset1_len,
1910 size_t src_offset0_len, size_t src_dim2_size,
1911 size_t src_dim1_len)
1913 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1914 return false;
1916 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1918 CUDA_MEMCPY3D data;
1920 memset (&data, 0, sizeof (data));
1921 data.WidthInBytes = dim2_size;
1922 data.Height = dim1_len;
1923 data.Depth = dim0_len;
1925 if (dst_ord == -1)
1927 data.dstMemoryType = CU_MEMORYTYPE_HOST;
1928 data.dstHost = dst;
1930 else
1932 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1933 data.dstDevice = (CUdeviceptr) dst;
1935 data.dstPitch = dst_dim2_size;
1936 data.dstHeight = dst_dim1_len;
1937 data.dstXInBytes = dst_offset2_size;
1938 data.dstY = dst_offset1_len;
1939 data.dstZ = dst_offset0_len;
1941 if (src_ord == -1)
1943 data.srcMemoryType = CU_MEMORYTYPE_HOST;
1944 data.srcHost = src;
1946 else
1948 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1949 data.srcDevice = (CUdeviceptr) src;
1951 data.srcPitch = src_dim2_size;
1952 data.srcHeight = src_dim1_len;
1953 data.srcXInBytes = src_offset2_size;
1954 data.srcY = src_offset1_len;
1955 data.srcZ = src_offset0_len;
1957 CUDA_CALL (cuMemcpy3D, &data);
1958 return true;
1961 bool
1962 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1963 size_t n, struct goacc_asyncqueue *aq)
1965 if (!nvptx_attach_host_thread_to_device (ord)
1966 || !cuda_memcpy_sanity_check (src, dst, n))
1967 return false;
1968 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1969 return true;
1972 bool
1973 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1974 size_t n, struct goacc_asyncqueue *aq)
1976 if (!nvptx_attach_host_thread_to_device (ord)
1977 || !cuda_memcpy_sanity_check (dst, src, n))
1978 return false;
1979 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1980 return true;
1983 union goacc_property_value
1984 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1986 union goacc_property_value propval = { .val = 0 };
1988 pthread_mutex_lock (&ptx_dev_lock);
1990 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1992 pthread_mutex_unlock (&ptx_dev_lock);
1993 return propval;
1996 struct ptx_device *ptx_dev = ptx_devices[n];
1997 switch (prop)
1999 case GOACC_PROPERTY_MEMORY:
2001 size_t total_mem;
2003 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2004 propval.val = total_mem;
2006 break;
2007 case GOACC_PROPERTY_FREE_MEMORY:
2009 size_t total_mem;
2010 size_t free_mem;
2011 CUdevice ctxdev;
2013 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2014 if (ptx_dev->dev == ctxdev)
2015 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2016 else if (ptx_dev->ctx)
2018 CUcontext old_ctx;
2020 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2021 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2022 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2024 else
2026 CUcontext new_ctx;
2028 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2029 ptx_dev->dev);
2030 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2031 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2033 propval.val = free_mem;
2035 break;
2036 case GOACC_PROPERTY_NAME:
2037 propval.ptr = ptx_dev->name;
2038 break;
2039 case GOACC_PROPERTY_VENDOR:
2040 propval.ptr = "Nvidia";
2041 break;
2042 case GOACC_PROPERTY_DRIVER:
2043 propval.ptr = cuda_driver_version_s;
2044 break;
2045 default:
2046 break;
2049 pthread_mutex_unlock (&ptx_dev_lock);
2050 return propval;
/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed either the CUDA
   limits or GCC's own limits.  */
2057 static void
2058 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2059 struct ptx_device *ptx_dev,
2060 int *teams_p, int *threads_p)
2062 int max_warps_block = fn->max_threads_per_block / 32;
/* A maximum of 32 warps per block is an implementation limit in the NVPTX
   backend and libgcc, which matches the documented limit of all GPUs as of
   2015.  */
2065 if (max_warps_block > 32)
2066 max_warps_block = 32;
2067 if (*threads_p <= 0)
2068 *threads_p = 8;
2069 if (*threads_p > max_warps_block)
2070 *threads_p = max_warps_block;
2072 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
/* This is an estimate of how many blocks the device can host simultaneously.
   The actual limit, which may be lower, can be queried via the "occupancy
   control" driver interface (since CUDA 6.0).  */
2076 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2077 if (*teams_p <= 0 || *teams_p > max_blocks)
2078 *teams_p = max_blocks;
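/* Purely illustrative example (device numbers assumed, not queried): with
   fn->regs_per_thread == 32 and *threads_p == 8, regs_per_block is
   32 * 32 * 8 == 8192, so a device with regs_per_sm == 65536 and num_sms == 80
   would get max_blocks == 8 * 80 == 640.  */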
2081 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2082 target regions. */
2084 static size_t
2085 nvptx_stacks_size ()
2087 return 128 * 1024;
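/* I.e. 128 KiB per warp; GOMP_OFFLOAD_run below passes this size to
   nvptx_stacks_acquire for each of its teams * threads warps.  */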
2090 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2091 the storage should be held on entry, and remains held on exit. */
2093 static void *
2094 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2096 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2097 return (void *) ptx_dev->omp_stacks.ptr;
2099 /* Free the old, too-small stacks. */
2100 if (ptx_dev->omp_stacks.ptr)
2102 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2103 if (r != CUDA_SUCCESS)
2104 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2105 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2106 if (r != CUDA_SUCCESS)
2107 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2110 /* Make new and bigger stacks, and remember where we put them and how big
2111 they are. */
2112 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2113 size * num);
2114 if (r != CUDA_SUCCESS)
2115 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2117 ptx_dev->omp_stacks.size = size * num;
2119 return (void *) ptx_dev->omp_stacks.ptr;
2123 void
2124 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2126 struct targ_fn_descriptor *tgt_fn_desc
2127 = (struct targ_fn_descriptor *) tgt_fn;
2128 CUfunction function = tgt_fn_desc->fn;
2129 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2130 const char *fn_name = launch->fn;
2131 CUresult r;
2132 struct ptx_device *ptx_dev = ptx_devices[ord];
2133 const char *maybe_abort_msg = "(perhaps abort was called)";
2134 int teams = 0, threads = 0;
2136 if (!args)
2137 GOMP_PLUGIN_fatal ("No target arguments provided");
2138 while (*args)
2140 intptr_t id = (intptr_t) *args++, val;
2141 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2142 val = (intptr_t) *args++;
2143 else
2144 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2145 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2146 continue;
2147 val = val > INT_MAX ? INT_MAX : val;
2148 id &= GOMP_TARGET_ARG_ID_MASK;
2149 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2150 teams = val;
2151 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2152 threads = val;
2154 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2156 bool reverse_offload = ptx_dev->rev_data != NULL;
2157 struct goacc_asyncqueue *reverse_offload_aq = NULL;
2158 if (reverse_offload)
2160 reverse_offload_aq
2161 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2162 if (!reverse_offload_aq)
2163 exit (EXIT_FAILURE);
2166 size_t stack_size = nvptx_stacks_size ();
2168 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2169 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2170 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2171 size_t fn_args_size = sizeof fn_args;
2172 void *config[] = {
2173 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2174 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2175 CU_LAUNCH_PARAM_END
2177 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2178 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2179 __FUNCTION__, fn_name, teams, threads);
2180 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2181 32, threads, 1, 0, NULL, NULL, config);
2182 if (r != CUDA_SUCCESS)
2183 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
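/* With reverse offload enabled, poll the null stream instead of blocking in
   cuCtxSynchronize: whenever the device side has written a request into the
   pinned rev_data block, run it on the host via GOMP_PLUGIN_target_rev and
   then clear rev_data->fn so the device can continue.  */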
2184 if (reverse_offload)
2185 while (true)
2187 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2188 if (r == CUDA_SUCCESS)
2189 break;
2190 if (r == CUDA_ERROR_LAUNCH_FAILED)
2191 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2192 maybe_abort_msg);
2193 else if (r != CUDA_ERROR_NOT_READY)
2194 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2196 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2198 struct rev_offload *rev_data = ptx_dev->rev_data;
2199 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2200 rev_data->addrs, rev_data->sizes,
2201 rev_data->kinds, rev_data->dev_num,
2202 reverse_offload_aq);
2203 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2204 exit (EXIT_FAILURE);
2205 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2207 usleep (1);
2209 else
2210 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2211 if (r == CUDA_ERROR_LAUNCH_FAILED)
2212 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2213 maybe_abort_msg);
2214 else if (r != CUDA_SUCCESS)
2215 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2217 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2219 if (reverse_offload)
2221 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2222 exit (EXIT_FAILURE);
2226 /* TODO: Implement GOMP_OFFLOAD_async_run. */