1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
68 #define SOFTSTACK_CACHE_LIMIT 134217728
70 #if CUDA_VERSION < 6000
71 extern CUresult
cuGetErrorString (CUresult
, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
75 #if CUDA_VERSION >= 6050
78 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
79 const char *, unsigned, CUjit_option
*, void **);
80 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
82 typedef size_t (*CUoccupancyB2DSize
)(int);
83 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
84 const char *, unsigned, CUjit_option
*, void **);
85 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
86 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
87 CUoccupancyB2DSize
, size_t, int);
90 #define DO_PRAGMA(x) _Pragma (#x)
92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
97 # define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
99 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
101 #include "cuda-lib.def"
102 # undef CUDA_ONE_CALL
103 # undef CUDA_ONE_CALL_MAYBE_NULL
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static signed char cuda_lib_inited
= -1;
111 /* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
116 if (cuda_lib_inited
!= -1)
117 return cuda_lib_inited
;
118 const char *cuda_runtime_lib
= "libcuda.so.1";
119 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
120 cuda_lib_inited
= false;
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126 # define CUDA_ONE_CALL_1(call, allow_null) \
127 cuda_lib.call = dlsym (h, #call); \
128 if (!allow_null && cuda_lib.call == NULL) \
129 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", cuda_runtime_lib, #call);
130 #include "cuda-lib.def"
131 # undef CUDA_ONE_CALL
132 # undef CUDA_ONE_CALL_1
133 # undef CUDA_ONE_CALL_MAYBE_NULL
135 cuda_lib_inited
= true;
138 # define CUDA_CALL_PREFIX cuda_lib.
141 # define CUDA_ONE_CALL(call)
142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143 #include "cuda-lib.def"
144 #undef CUDA_ONE_CALL_MAYBE_NULL
147 # define CUDA_CALL_PREFIX
148 # define init_cuda_lib() true
151 #include "secure_getenv.h"
153 static void notify_var (const char *, const char *);
157 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
158 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
160 /* Convenience macros for the frequently used CUDA library call and
161 error handling sequence as well as CUDA library calls that
162 do the error checking themselves or don't do it at all. */
164 #define CUDA_CALL_ERET(ERET, FN, ...) \
167 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
168 if (__r != CUDA_SUCCESS) \
170 GOMP_PLUGIN_error (#FN " error: %s", \
176 #define CUDA_CALL(FN, ...) \
177 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
179 #define CUDA_CALL_ASSERT(FN, ...) \
182 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
183 if (__r != CUDA_SUCCESS) \
185 GOMP_PLUGIN_fatal (#FN " error: %s", \
190 #define CUDA_CALL_NOCHECK(FN, ...) \
191 CUDA_CALL_PREFIX FN (__VA_ARGS__)
193 #define CUDA_CALL_EXISTS(FN) \
197 cuda_error (CUresult r
)
199 const char *fallback
= "unknown cuda error";
202 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
205 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
206 if (r
== CUDA_SUCCESS
)
212 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
213 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
214 static char cuda_driver_version_s
[30];
216 static unsigned int instantiated_devices
= 0;
217 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
219 /* NVPTX/CUDA specific definition of asynchronous queues. */
220 struct goacc_asyncqueue
222 CUstream cuda_stream
;
225 struct nvptx_callback
229 struct goacc_asyncqueue
*aq
;
230 struct nvptx_callback
*next
;
233 /* Thread-specific data for PTX. */
237 /* We currently have this embedded inside the plugin because libgomp manages
238 devices through integer target_ids. This might be better if using an
239 opaque target-specific pointer directly from gomp_device_descr. */
240 struct ptx_device
*ptx_dev
;
243 /* Target data function launch information. */
245 struct targ_fn_launch
248 unsigned short dim
[GOMP_DIM_MAX
];
251 /* Target PTX object information. */
259 /* Target data image information. */
261 typedef struct nvptx_tdata
263 const struct targ_ptx_obj
*ptx_objs
;
266 const char *const *var_names
;
269 const struct targ_fn_launch
*fn_descs
;
275 /* Descriptor of a loaded function. */
277 struct targ_fn_descriptor
280 const struct targ_fn_launch
*launch
;
282 int max_threads_per_block
;
285 /* A loaded PTX image. */
286 struct ptx_image_data
288 const void *target_data
;
291 struct targ_fn_descriptor
*fns
; /* Array of functions. */
293 struct ptx_image_data
*next
;
296 struct ptx_free_block
299 struct ptx_free_block
*next
;
319 int max_threads_per_block
;
320 int max_threads_per_multiprocessor
;
321 int default_dims
[GOMP_DIM_MAX
];
323 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
326 struct ptx_image_data
*images
; /* Images loaded on device. */
327 pthread_mutex_t image_lock
; /* Lock for above list. */
329 struct ptx_free_block
*free_blocks
;
330 pthread_mutex_t free_blocks_lock
;
332 /* OpenMP stacks, cached between kernel invocations. */
337 pthread_mutex_t lock
;
340 struct rev_offload
*rev_data
;
341 struct ptx_device
*next
;
344 static struct ptx_device
**ptx_devices
;
346 /* "Native" GPU thread stack size. */
347 static unsigned native_gpu_thread_stack_size
= 0;
349 /* OpenMP kernels reserve a small amount of ".shared" space for use by
350 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
351 default is set here. */
352 static unsigned lowlat_pool_size
= 8 * 1024;
354 static bool nvptx_do_global_cdtors (CUmodule
, struct ptx_device
*,
356 static size_t nvptx_stacks_size ();
357 static void *nvptx_stacks_acquire (struct ptx_device
*, size_t, int);
359 static inline struct nvptx_thread
*
362 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
365 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
366 should be locked on entry and remains locked on exit. */
373 if (instantiated_devices
!= 0)
376 if (!init_cuda_lib ())
379 CUDA_CALL (cuInit
, 0);
381 int cuda_driver_version
;
382 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
383 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
385 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
387 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
388 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
394 /* Select the N'th PTX device for the current host thread. The device must
395 have been previously opened before calling this function. */
398 nvptx_attach_host_thread_to_device (int n
)
402 struct ptx_device
*ptx_dev
;
405 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
406 if (r
== CUDA_ERROR_NOT_PERMITTED
)
408 /* Assume we're in a CUDA callback, just return true. */
411 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
413 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
417 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
423 ptx_dev
= ptx_devices
[n
];
426 GOMP_PLUGIN_error ("device %d not found", n
);
430 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
432 /* We don't necessarily have a current context (e.g. if it has been
433 destroyed. Pop it if we do though. */
435 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
437 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
442 static struct ptx_device
*
443 nvptx_open_device (int n
)
445 struct ptx_device
*ptx_dev
;
446 CUdevice dev
, ctx_dev
;
450 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
452 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
456 ptx_dev
->ctx_shared
= false;
458 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
459 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
461 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
465 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
467 /* The current host thread has an active context for a different device.
470 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
473 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
476 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
478 ptx_dev
->ctx_shared
= true;
480 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
481 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
482 ptx_dev
->overlap
= pi
;
484 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
485 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
488 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
489 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
490 ptx_dev
->concur
= pi
;
492 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
493 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
496 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
497 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
500 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
501 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
502 ptx_dev
->clock_khz
= pi
;
504 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
505 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
506 ptx_dev
->num_sms
= pi
;
508 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
509 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
510 ptx_dev
->regs_per_block
= pi
;
512 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
513 in CUDA 6.0 and newer. */
514 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
515 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
517 /* Fallback: use limit of registers per block, which is usually equal. */
518 if (r
== CUDA_ERROR_INVALID_VALUE
)
519 pi
= ptx_dev
->regs_per_block
;
520 else if (r
!= CUDA_SUCCESS
)
522 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
525 ptx_dev
->regs_per_sm
= pi
;
527 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
528 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
531 GOMP_PLUGIN_error ("Only warp size 32 is supported");
534 ptx_dev
->warp_size
= pi
;
536 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
537 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
538 ptx_dev
->max_threads_per_block
= pi
;
540 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
541 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
542 ptx_dev
->max_threads_per_multiprocessor
= pi
;
544 /* Required below for reverse offload as implemented, but with compute
545 capability >= 2.0 and 64bit device processes, this should be universally be
546 the case; hence, an assert. */
547 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
548 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING
, dev
);
549 assert (r
== CUDA_SUCCESS
&& pi
);
551 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
552 ptx_dev
->default_dims
[i
] = 0;
554 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
557 ptx_dev
->images
= NULL
;
558 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
560 ptx_dev
->free_blocks
= NULL
;
561 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
563 /* "Native" GPU thread stack size. */
565 /* This is intentionally undocumented, until we work out a proper, common
566 scheme (as much as makes sense) between all offload plugins as well
567 as between nvptx offloading use of "native" stacks for OpenACC vs.
568 OpenMP "soft stacks" vs. OpenMP '-msoft-stack-reserve-local=[...]'.
570 GCN offloading has a 'GCN_STACK_SIZE' environment variable (without
571 'GOMP_' prefix): documented; presumably used for all things OpenACC and
572 OpenMP? Based on GCN command-line option '-mstack-size=[...]' (marked
573 "obsolete"), that one may be set via a GCN 'mkoffload'-synthesized
574 'constructor' function. */
575 const char *var_name
= "GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE";
576 const char *env_var
= secure_getenv (var_name
);
577 notify_var (var_name
, env_var
);
582 unsigned long val
= strtoul (env_var
, &endptr
, 10);
583 if (endptr
== NULL
|| *endptr
!= '\0'
584 || errno
== ERANGE
|| errno
== EINVAL
586 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
588 native_gpu_thread_stack_size
= val
;
591 if (native_gpu_thread_stack_size
== 0)
592 ; /* Zero means use default. */
595 GOMP_PLUGIN_debug (0, "Setting \"native\" GPU thread stack size"
596 " ('CU_LIMIT_STACK_SIZE') to %u bytes\n",
597 native_gpu_thread_stack_size
);
598 CUDA_CALL (cuCtxSetLimit
,
599 CU_LIMIT_STACK_SIZE
, (size_t) native_gpu_thread_stack_size
);
602 /* OpenMP "soft stacks". */
603 ptx_dev
->omp_stacks
.ptr
= 0;
604 ptx_dev
->omp_stacks
.size
= 0;
605 pthread_mutex_init (&ptx_dev
->omp_stacks
.lock
, NULL
);
607 ptx_dev
->rev_data
= NULL
;
613 nvptx_close_device (struct ptx_device
*ptx_dev
)
620 for (struct ptx_image_data
*image
= ptx_dev
->images
;
624 if (!nvptx_do_global_cdtors (image
->module
, ptx_dev
,
625 "__do_global_dtors__entry"
626 /* or "__do_global_dtors__entry__mgomp" */))
630 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
632 struct ptx_free_block
*b_next
= b
->next
;
633 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
638 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
639 pthread_mutex_destroy (&ptx_dev
->image_lock
);
641 pthread_mutex_destroy (&ptx_dev
->omp_stacks
.lock
);
643 if (ptx_dev
->omp_stacks
.ptr
)
644 CUDA_CALL (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
646 if (!ptx_dev
->ctx_shared
)
647 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
655 nvptx_get_num_devices (void)
659 /* This function will be called before the plugin has been initialized in
660 order to enumerate available devices, but CUDA API routines can't be used
661 until cuInit has been called. Just call it now (but don't yet do any
662 further initialization). */
663 if (instantiated_devices
== 0)
665 if (!init_cuda_lib ())
667 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
668 /* This is not an error: e.g. we may have CUDA libraries installed but
669 no devices available. */
670 if (r
== CUDA_ERROR_NO_DEVICE
)
672 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
676 else if (r
!= CUDA_SUCCESS
)
677 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r
));
680 CUDA_CALL_ASSERT (cuDeviceGetCount
, &n
);
685 notify_var (const char *var_name
, const char *env_var
)
688 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
690 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
694 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
696 const char *var_name
= "GOMP_NVPTX_JIT";
697 const char *env_var
= secure_getenv (var_name
);
698 notify_var (var_name
, env_var
);
703 const char *c
= env_var
;
709 if (c
[0] == '-' && c
[1] == 'O'
710 && '0' <= c
[2] && c
[2] <= '4'
711 && (c
[3] == '\0' || c
[3] == ' '))
713 *gomp_nvptx_o
= c
[2] - '0';
718 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
724 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
727 CUjit_option opts
[7];
732 CUlinkState linkstate
;
735 size_t linkoutsize
__attribute__ ((unused
));
737 opts
[0] = CU_JIT_WALL_TIME
;
738 optvals
[0] = &elapsed
;
740 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
741 optvals
[1] = &ilog
[0];
743 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
744 optvals
[2] = (void *) sizeof ilog
;
746 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
747 optvals
[3] = &elog
[0];
749 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
750 optvals
[4] = (void *) sizeof elog
;
752 opts
[5] = CU_JIT_LOG_VERBOSE
;
753 optvals
[5] = (void *) 1;
755 static intptr_t gomp_nvptx_o
= -1;
757 static bool init_done
= false;
760 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
765 if (gomp_nvptx_o
!= -1)
767 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
768 optvals
[nopts
] = (void *) gomp_nvptx_o
;
772 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
773 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
775 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
777 for (; num_objs
--; ptx_objs
++)
779 /* cuLinkAddData's 'data' argument erroneously omits the const
781 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
782 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
783 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
784 (char *) ptx_objs
->code
, ptx_objs
->size
,
787 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
788 (char *) ptx_objs
->code
, ptx_objs
->size
,
790 if (r
!= CUDA_SUCCESS
)
792 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
793 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
799 GOMP_PLUGIN_debug (0, "Linking\n");
800 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
802 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
803 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
805 if (r
!= CUDA_SUCCESS
)
807 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
808 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
812 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
813 CUDA_CALL (cuLinkDestroy
, linkstate
);
818 nvptx_exec (void (*fn
), unsigned *dims
, void *targ_mem_desc
,
819 CUdeviceptr dp
, CUstream stream
)
821 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
825 struct nvptx_thread
*nvthd
= nvptx_thread ();
826 int warp_size
= nvthd
->ptx_dev
->warp_size
;
828 function
= targ_fn
->fn
;
830 /* Initialize the launch dimensions. Typically this is constant,
831 provided by the device compiler, but we must permit runtime
834 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
836 if (targ_fn
->launch
->dim
[i
])
837 dims
[i
] = targ_fn
->launch
->dim
[i
];
844 pthread_mutex_lock (&ptx_dev_lock
);
846 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
847 if (!gomp_openacc_dims
[0])
849 /* See if the user provided GOMP_OPENACC_DIM environment
850 variable to specify runtime defaults. */
851 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
852 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
855 if (!nvthd
->ptx_dev
->default_dims
[0])
857 int default_dims
[GOMP_DIM_MAX
];
858 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
859 default_dims
[i
] = gomp_openacc_dims
[i
];
861 int gang
, worker
, vector
;
863 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
864 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
865 int dev_size
= nvthd
->ptx_dev
->num_sms
;
866 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
867 " dev_size=%d, cpu_size=%d\n",
868 warp_size
, block_size
, dev_size
, cpu_size
);
870 gang
= (cpu_size
/ block_size
) * dev_size
;
871 worker
= block_size
/ warp_size
;
875 /* There is no upper bound on the gang size. The best size
876 matches the hardware configuration. Logical gangs are
877 scheduled onto physical hardware. To maximize usage, we
878 should guess a large number. */
879 if (default_dims
[GOMP_DIM_GANG
] < 1)
880 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
881 /* The worker size must not exceed the hardware. */
882 if (default_dims
[GOMP_DIM_WORKER
] < 1
883 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
884 default_dims
[GOMP_DIM_WORKER
] = worker
;
885 /* The vector size must exactly match the hardware. */
886 if (default_dims
[GOMP_DIM_VECTOR
] < 1
887 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
888 default_dims
[GOMP_DIM_VECTOR
] = vector
;
890 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
891 default_dims
[GOMP_DIM_GANG
],
892 default_dims
[GOMP_DIM_WORKER
],
893 default_dims
[GOMP_DIM_VECTOR
]);
895 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
896 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
898 pthread_mutex_unlock (&ptx_dev_lock
);
901 bool default_dim_p
[GOMP_DIM_MAX
];
902 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
903 default_dim_p
[i
] = !dims
[i
];
905 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
907 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
908 if (default_dim_p
[i
])
909 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
911 if (default_dim_p
[GOMP_DIM_VECTOR
])
912 dims
[GOMP_DIM_VECTOR
]
913 = MIN (dims
[GOMP_DIM_VECTOR
],
914 (targ_fn
->max_threads_per_block
/ warp_size
917 if (default_dim_p
[GOMP_DIM_WORKER
])
918 dims
[GOMP_DIM_WORKER
]
919 = MIN (dims
[GOMP_DIM_WORKER
],
920 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
924 /* Handle the case that the compiler allows the runtime to choose
925 the vector-length conservatively, by ignoring
926 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
929 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
930 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
931 exceed targ_fn->max_threads_per_block. */
932 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
933 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
936 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
937 &blocks
, function
, NULL
, 0,
938 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
939 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
940 "grid = %d, block = %d\n", grids
, blocks
);
942 /* Keep the num_gangs proportional to the block size. In
943 the case were a block size is limited by shared-memory
944 or the register file capacity, the runtime will not
945 excessively over assign gangs to the multiprocessor
946 units if their state is going to be swapped out even
947 more than necessary. The constant factor 2 is there to
948 prevent threads from idling when there is insufficient
951 gangs
= 2 * grids
* (blocks
/ warp_size
);
958 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
960 : dims
[GOMP_DIM_VECTOR
]);
961 workers
= blocks
/ actual_vectors
;
962 workers
= MAX (workers
, 1);
963 /* If we need a per-worker barrier ... . */
964 if (actual_vectors
> 32)
965 /* Don't use more barriers than available. */
966 workers
= MIN (workers
, 15);
969 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
970 if (default_dim_p
[i
])
973 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
974 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
975 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
976 default: GOMP_PLUGIN_fatal ("invalid dim");
982 /* Check if the accelerator has sufficient hardware resources to
983 launch the offloaded kernel. */
984 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
985 > targ_fn
->max_threads_per_block
)
988 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
989 " with num_workers = %d and vector_length = %d"
991 "recompile the program with 'num_workers = x and vector_length = y'"
992 " on that offloaded region or '-fopenacc-dim=:x:y' where"
995 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
996 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
999 /* Check if the accelerator has sufficient barrier resources to
1000 launch the offloaded kernel. */
1001 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
1004 = ("The Nvidia accelerator has insufficient barrier resources to launch"
1005 " '%s' with num_workers = %d and vector_length = %d"
1007 "recompile the program with 'num_workers = x' on that offloaded"
1008 " region or '-fopenacc-dim=:x:' where x <= 15"
1010 "or, recompile the program with 'vector_length = 32' on that"
1011 " offloaded region or '-fopenacc-dim=::32'"
1013 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
1014 dims
[GOMP_DIM_VECTOR
]);
1017 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1018 " gangs=%u, workers=%u, vectors=%u\n",
1019 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
1020 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
1024 // num_gangs nctaid.x
1025 // num_workers ntid.y
1026 // vector length ntid.x
1028 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1029 acc_prof_info
*prof_info
= thr
->prof_info
;
1030 acc_event_info enqueue_launch_event_info
;
1031 acc_api_info
*api_info
= thr
->api_info
;
1032 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1035 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
1037 enqueue_launch_event_info
.launch_event
.event_type
1038 = prof_info
->event_type
;
1039 enqueue_launch_event_info
.launch_event
.valid_bytes
1040 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
1041 enqueue_launch_event_info
.launch_event
.parent_construct
1042 = acc_construct_parallel
;
1043 enqueue_launch_event_info
.launch_event
.implicit
= 1;
1044 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
1045 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
1046 enqueue_launch_event_info
.launch_event
.num_gangs
1047 = dims
[GOMP_DIM_GANG
];
1048 enqueue_launch_event_info
.launch_event
.num_workers
1049 = dims
[GOMP_DIM_WORKER
];
1050 enqueue_launch_event_info
.launch_event
.vector_length
1051 = dims
[GOMP_DIM_VECTOR
];
1053 api_info
->device_api
= acc_device_api_cuda
;
1055 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
1060 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
1061 dims
[GOMP_DIM_GANG
], 1, 1,
1062 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
1063 0, stream
, kargs
, 0);
1067 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
1068 enqueue_launch_event_info
.launch_event
.event_type
1069 = prof_info
->event_type
;
1070 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
1074 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1075 targ_fn
->launch
->fn
);
1078 void * openacc_get_current_cuda_context (void);
1081 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
1083 acc_prof_info
*prof_info
= thr
->prof_info
;
1084 acc_event_info data_event_info
;
1085 acc_api_info
*api_info
= thr
->api_info
;
1087 prof_info
->event_type
= acc_ev_alloc
;
1089 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1090 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1091 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1092 data_event_info
.data_event
.implicit
= 1;
1093 data_event_info
.data_event
.tool_info
= NULL
;
1094 data_event_info
.data_event
.var_name
= NULL
;
1095 data_event_info
.data_event
.bytes
= s
;
1096 data_event_info
.data_event
.host_ptr
= NULL
;
1097 data_event_info
.data_event
.device_ptr
= dp
;
1099 api_info
->device_api
= acc_device_api_cuda
;
1101 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1104 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1105 size threshold, or if FORCE is true. */
1108 nvptx_stacks_free (struct ptx_device
*ptx_dev
, bool force
)
1110 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1111 if (ptx_dev
->omp_stacks
.ptr
1112 && (force
|| ptx_dev
->omp_stacks
.size
> SOFTSTACK_CACHE_LIMIT
))
1114 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
1115 if (r
!= CUDA_SUCCESS
)
1116 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1117 ptx_dev
->omp_stacks
.ptr
= 0;
1118 ptx_dev
->omp_stacks
.size
= 0;
1120 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1124 nvptx_alloc (size_t s
, bool suppress_errors
)
1128 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &d
, s
);
1129 if (suppress_errors
&& r
== CUDA_ERROR_OUT_OF_MEMORY
)
1131 else if (r
!= CUDA_SUCCESS
)
1133 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r
));
1137 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1138 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1140 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1142 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1148 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1150 acc_prof_info
*prof_info
= thr
->prof_info
;
1151 acc_event_info data_event_info
;
1152 acc_api_info
*api_info
= thr
->api_info
;
1154 prof_info
->event_type
= acc_ev_free
;
1156 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1157 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1158 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1159 data_event_info
.data_event
.implicit
= 1;
1160 data_event_info
.data_event
.tool_info
= NULL
;
1161 data_event_info
.data_event
.var_name
= NULL
;
1162 data_event_info
.data_event
.bytes
= -1;
1163 data_event_info
.data_event
.host_ptr
= NULL
;
1164 data_event_info
.data_event
.device_ptr
= p
;
1166 api_info
->device_api
= acc_device_api_cuda
;
1168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1172 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1177 CUresult r
= CUDA_CALL_NOCHECK (cuMemGetAddressRange
, &pb
, &ps
,
1179 if (r
== CUDA_ERROR_NOT_PERMITTED
)
1181 /* We assume that this error indicates we are in a CUDA callback context,
1182 where all CUDA calls are not allowed (see cuStreamAddCallback
1183 documentation for description). Arrange to free this piece of device
1185 struct ptx_free_block
*n
1186 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1188 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1189 n
->next
= ptx_dev
->free_blocks
;
1190 ptx_dev
->free_blocks
= n
;
1191 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1194 else if (r
!= CUDA_SUCCESS
)
1196 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r
));
1199 if ((CUdeviceptr
) p
!= pb
)
1201 GOMP_PLUGIN_error ("invalid device address");
1205 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1206 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1208 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1210 goacc_profiling_acc_ev_free (thr
, p
);
1216 nvptx_get_current_cuda_device (void)
1218 struct nvptx_thread
*nvthd
= nvptx_thread ();
1220 if (!nvthd
|| !nvthd
->ptx_dev
)
1223 return &nvthd
->ptx_dev
->dev
;
1227 nvptx_get_current_cuda_context (void)
1229 struct nvptx_thread
*nvthd
= nvptx_thread ();
1231 if (!nvthd
|| !nvthd
->ptx_dev
)
1234 return nvthd
->ptx_dev
->ctx
;
1237 /* Plugin entry points. */
/* Return the plugin's (device type) name.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
/* Return the UID; if not available return NULL.
   Returns freshly allocated memory.  */
1249 GOMP_OFFLOAD_get_uid (int ord
)
1253 struct ptx_device
*dev
= ptx_devices
[ord
];
1255 if (CUDA_CALL_EXISTS (cuDeviceGetUuid_v2
))
1256 r
= CUDA_CALL_NOCHECK (cuDeviceGetUuid_v2
, &s
, dev
->dev
);
1257 else if (CUDA_CALL_EXISTS (cuDeviceGetUuid
))
1258 r
= CUDA_CALL_NOCHECK (cuDeviceGetUuid
, &s
, dev
->dev
);
1261 if (r
!= CUDA_SUCCESS
)
1264 size_t len
= strlen ("GPU-12345678-9abc-defg-hijk-lmniopqrstuv");
1265 char *str
= (char *) GOMP_PLUGIN_malloc (len
+ 1);
1267 "GPU-%02x" "%02x" "%02x" "%02x"
1270 "-%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x" "%02x",
1271 (unsigned char) s
.bytes
[0], (unsigned char) s
.bytes
[1],
1272 (unsigned char) s
.bytes
[2], (unsigned char) s
.bytes
[3],
1273 (unsigned char) s
.bytes
[4], (unsigned char) s
.bytes
[5],
1274 (unsigned char) s
.bytes
[6], (unsigned char) s
.bytes
[7],
1275 (unsigned char) s
.bytes
[8], (unsigned char) s
.bytes
[9],
1276 (unsigned char) s
.bytes
[10], (unsigned char) s
.bytes
[11],
1277 (unsigned char) s
.bytes
[12], (unsigned char) s
.bytes
[13],
1278 (unsigned char) s
.bytes
[14], (unsigned char) s
.bytes
[15]);
1283 GOMP_OFFLOAD_get_caps (void)
1285 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1289 GOMP_OFFLOAD_get_type (void)
1291 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1295 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
1297 int num_devices
= nvptx_get_num_devices ();
1298 /* Return -1 if no omp_requires_mask cannot be fulfilled but
1299 devices were present. Unified-shared address: see comment in
1300 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1302 && ((omp_requires_mask
1303 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1304 | GOMP_REQUIRES_SELF_MAPS
1305 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
1306 | GOMP_REQUIRES_REVERSE_OFFLOAD
)) != 0))
1308 /* Check whether host page access (direct or via migration) is supported;
1309 if so, enable USM. Currently, capabilities is per device type, hence,
1310 check all devices. */
1312 && (omp_requires_mask
1313 & (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
| GOMP_REQUIRES_SELF_MAPS
)))
1314 for (int dev
= 0; dev
< num_devices
; dev
++)
1318 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
1319 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS
, dev
);
1320 if (r
!= CUDA_SUCCESS
|| pi
== 0)
1327 GOMP_OFFLOAD_init_device (int n
)
1329 struct ptx_device
*dev
;
1331 pthread_mutex_lock (&ptx_dev_lock
);
1333 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1335 pthread_mutex_unlock (&ptx_dev_lock
);
1339 dev
= nvptx_open_device (n
);
1342 ptx_devices
[n
] = dev
;
1343 instantiated_devices
++;
1346 const char *var_name
= "GOMP_NVPTX_LOWLAT_POOL";
1347 const char *env_var
= secure_getenv (var_name
);
1348 notify_var (var_name
, env_var
);
1350 if (env_var
!= NULL
)
1353 unsigned long val
= strtoul (env_var
, &endptr
, 10);
1354 if (endptr
== NULL
|| *endptr
!= '\0'
1355 || errno
== ERANGE
|| errno
== EINVAL
1357 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
1359 lowlat_pool_size
= val
;
1362 pthread_mutex_unlock (&ptx_dev_lock
);
1368 GOMP_OFFLOAD_fini_device (int n
)
1370 pthread_mutex_lock (&ptx_dev_lock
);
1372 if (ptx_devices
[n
] != NULL
)
1374 if (!nvptx_attach_host_thread_to_device (n
)
1375 || !nvptx_close_device (ptx_devices
[n
]))
1377 pthread_mutex_unlock (&ptx_dev_lock
);
1380 ptx_devices
[n
] = NULL
;
1381 instantiated_devices
--;
1384 if (instantiated_devices
== 0)
1390 pthread_mutex_unlock (&ptx_dev_lock
);
1394 /* Return the libgomp version number we're compatible with. There is
1395 no requirement for cross-version compatibility. */
1398 GOMP_OFFLOAD_version (void)
1400 return GOMP_VERSION
;
1403 /* Initialize __nvptx_clocktick, if present in MODULE. */
1406 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1409 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1410 module
, "__nvptx_clocktick");
1411 if (r
== CUDA_ERROR_NOT_FOUND
)
1413 if (r
!= CUDA_SUCCESS
)
1414 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1415 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1416 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1417 sizeof (__nvptx_clocktick
));
1418 if (r
!= CUDA_SUCCESS
)
1419 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1422 /* Invoke MODULE's global constructors/destructors. */
1425 nvptx_do_global_cdtors (CUmodule module
, struct ptx_device
*ptx_dev
,
1426 const char *funcname
)
1429 char *funcname_mgomp
= NULL
;
1432 r
= CUDA_CALL_NOCHECK (cuModuleGetFunction
,
1433 &funcptr
, module
, funcname
);
1434 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1435 funcname
, cuda_error (r
));
1436 if (r
== CUDA_ERROR_NOT_FOUND
)
1438 /* Try '[funcname]__mgomp'. */
1440 size_t funcname_len
= strlen (funcname
);
1441 const char *mgomp_suffix
= "__mgomp";
1442 size_t mgomp_suffix_len
= strlen (mgomp_suffix
);
1444 = GOMP_PLUGIN_malloc (funcname_len
+ mgomp_suffix_len
+ 1);
1445 memcpy (funcname_mgomp
, funcname
, funcname_len
);
1446 memcpy (funcname_mgomp
+ funcname_len
,
1447 mgomp_suffix
, mgomp_suffix_len
+ 1);
1448 funcname
= funcname_mgomp
;
1450 r
= CUDA_CALL_NOCHECK (cuModuleGetFunction
,
1451 &funcptr
, module
, funcname
);
1452 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1453 funcname
, cuda_error (r
));
1455 if (r
== CUDA_ERROR_NOT_FOUND
)
1457 else if (r
!= CUDA_SUCCESS
)
1459 GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
1460 funcname
, cuda_error (r
));
1465 /* If necessary, set up soft stack. */
1466 void *nvptx_stacks_0
;
1470 size_t stack_size
= nvptx_stacks_size ();
1471 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
1472 nvptx_stacks_0
= nvptx_stacks_acquire (ptx_dev
, stack_size
, 1);
1473 nvptx_stacks_0
+= stack_size
;
1474 kargs
[0] = &nvptx_stacks_0
;
1476 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
,
1479 /* sharedMemBytes */ 0,
1481 /* kernelParams */ funcname_mgomp
? kargs
: NULL
,
1483 if (r
!= CUDA_SUCCESS
)
1485 GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
1486 funcname
, cuda_error (r
));
1490 r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
,
1492 if (r
!= CUDA_SUCCESS
)
1494 GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
1495 funcname
, cuda_error (r
));
1500 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
1504 free (funcname_mgomp
);
1509 /* Load the (partial) program described by TARGET_DATA to device
1510 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1511 will contain the on-device addresses of the functions for reverse offload.
1512 To be freed by the caller. */
1515 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1516 struct addr_pair
**target_table
,
1517 uint64_t **rev_fn_table
,
1518 uint64_t *host_ind_fn_table
)
1521 const char *const *var_names
;
1522 const struct targ_fn_launch
*fn_descs
;
1523 unsigned int fn_entries
, var_entries
, ind_fn_entries
, other_entries
, i
, j
;
1524 struct targ_fn_descriptor
*targ_fns
;
1525 struct addr_pair
*targ_tbl
;
1526 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1527 struct ptx_image_data
*new_image
;
1528 struct ptx_device
*dev
;
1530 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1532 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1533 " (expected %u, received %u)",
1534 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1538 if (!nvptx_attach_host_thread_to_device (ord
)
1539 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1542 dev
= ptx_devices
[ord
];
1544 /* The mkoffload utility emits a struct of pointers/integers at the
1545 start of each offload image. The array of kernel names and the
1546 functions addresses form a one-to-one correspondence. */
1548 var_entries
= img_header
->var_num
;
1549 var_names
= img_header
->var_names
;
1550 fn_entries
= img_header
->fn_num
;
1551 fn_descs
= img_header
->fn_descs
;
1552 ind_fn_entries
= GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version
)
1553 ? img_header
->ind_fn_num
: 0;
1555 /* Currently, other_entries contains only the struct of ICVs. */
1558 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1559 * (fn_entries
+ var_entries
+ other_entries
));
1560 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1563 *target_table
= targ_tbl
;
1565 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1566 new_image
->target_data
= target_data
;
1567 new_image
->module
= module
;
1568 new_image
->fns
= targ_fns
;
1570 pthread_mutex_lock (&dev
->image_lock
);
1571 new_image
->next
= dev
->images
;
1572 dev
->images
= new_image
;
1573 pthread_mutex_unlock (&dev
->image_lock
);
1575 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1577 CUfunction function
;
1580 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1582 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1583 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1584 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1585 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1587 targ_fns
->fn
= function
;
1588 targ_fns
->launch
= &fn_descs
[i
];
1589 targ_fns
->regs_per_thread
= nregs
;
1590 targ_fns
->max_threads_per_block
= mthrs
;
1592 targ_tbl
->start
= (uintptr_t) targ_fns
;
1593 targ_tbl
->end
= targ_tbl
->start
+ 1;
1596 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1601 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1602 &var
, &bytes
, module
, var_names
[j
]);
1604 targ_tbl
->start
= (uintptr_t) var
;
1605 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1608 if (ind_fn_entries
> 0)
1613 /* Read indirect function table from image. */
1614 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1615 "$offload_ind_func_table");
1616 if (r
!= CUDA_SUCCESS
)
1617 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1618 assert (bytes
== sizeof (uint64_t) * ind_fn_entries
);
1620 uint64_t ind_fn_table
[ind_fn_entries
];
1621 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, ind_fn_table
, var
, bytes
);
1622 if (r
!= CUDA_SUCCESS
)
1623 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1625 /* Build host->target address map for indirect functions. */
1626 uint64_t ind_fn_map
[ind_fn_entries
* 2 + 1];
1627 for (unsigned k
= 0; k
< ind_fn_entries
; k
++)
1629 ind_fn_map
[k
* 2] = host_ind_fn_table
[k
];
1630 ind_fn_map
[k
* 2 + 1] = ind_fn_table
[k
];
1631 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1632 k
, host_ind_fn_table
[k
], ind_fn_table
[k
]);
1634 ind_fn_map
[ind_fn_entries
* 2] = 0;
1636 /* Write the map onto the target. */
1637 void *map_target_addr
1638 = GOMP_OFFLOAD_alloc (ord
, sizeof (ind_fn_map
));
1639 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr
);
1641 GOMP_OFFLOAD_host2dev (ord
, map_target_addr
,
1643 sizeof (ind_fn_map
));
1645 /* Write address of the map onto the target. */
1648 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1649 module
, XSTRING (GOMP_INDIRECT_ADDR_MAP
));
1650 if (r
!= CUDA_SUCCESS
)
1651 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1654 GOMP_PLUGIN_debug (0,
1655 "Indirect map variable found at %llx with size %ld\n",
1658 GOMP_OFFLOAD_host2dev (ord
, (void *) varptr
, &map_target_addr
,
1659 sizeof (map_target_addr
));
1664 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &varptr
, &varsize
,
1665 module
, XSTRING (GOMP_ADDITIONAL_ICVS
));
1667 if (r
== CUDA_SUCCESS
)
1669 targ_tbl
->start
= (uintptr_t) varptr
;
1670 targ_tbl
->end
= (uintptr_t) (varptr
+ varsize
);
1673 /* The variable was not in this image. */
1674 targ_tbl
->start
= targ_tbl
->end
= 0;
1676 if (rev_fn_table
&& fn_entries
== 0)
1677 *rev_fn_table
= NULL
;
1678 else if (rev_fn_table
)
1683 r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &var
, &bytes
, module
,
1684 "$offload_func_table");
1685 if (r
!= CUDA_SUCCESS
)
1686 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1687 assert (bytes
== sizeof (uint64_t) * fn_entries
);
1688 *rev_fn_table
= GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries
);
1689 r
= CUDA_CALL_NOCHECK (cuMemcpyDtoH
, *rev_fn_table
, var
, bytes
);
1690 if (r
!= CUDA_SUCCESS
)
1691 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r
));
1692 /* Free if only NULL entries. */
1693 for (i
= 0; i
< fn_entries
; ++i
)
1694 if ((*rev_fn_table
)[i
] != 0)
1696 if (i
== fn_entries
)
1698 free (*rev_fn_table
);
1699 *rev_fn_table
= NULL
;
1703 if (rev_fn_table
&& *rev_fn_table
&& dev
->rev_data
== NULL
)
1705 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1706 available but it might be not. One reason could be: if the user code
1707 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1708 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1709 are not linked in. */
1710 CUdeviceptr device_rev_offload_var
;
1711 size_t device_rev_offload_size
;
1712 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
,
1713 &device_rev_offload_var
,
1714 &device_rev_offload_size
, module
,
1715 XSTRING (GOMP_REV_OFFLOAD_VAR
));
1716 if (r
!= CUDA_SUCCESS
)
1718 free (*rev_fn_table
);
1719 *rev_fn_table
= NULL
;
1723 /* cuMemHostAlloc memory is accessible on the device, if
1724 unified-shared address is supported; this is assumed - see comment
1725 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1726 CUDA_CALL_ASSERT (cuMemHostAlloc
, (void **) &dev
->rev_data
,
1727 sizeof (*dev
->rev_data
), CU_MEMHOSTALLOC_DEVICEMAP
);
1728 CUdeviceptr dp
= (CUdeviceptr
) dev
->rev_data
;
1729 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, device_rev_offload_var
, &dp
,
1731 if (r
!= CUDA_SUCCESS
)
1732 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1736 nvptx_set_clocktick (module
, dev
);
1738 if (!nvptx_do_global_cdtors (module
, dev
,
1739 "__do_global_ctors__entry"
1740 /* or "__do_global_ctors__entry__mgomp" */))
1743 return fn_entries
+ var_entries
+ other_entries
;
1746 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1747 function descriptors allocated by G_O_load_image. */
1750 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1752 struct ptx_image_data
*image
, **prev_p
;
1753 struct ptx_device
*dev
= ptx_devices
[ord
];
1755 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1757 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1758 " (expected %u, received %u)",
1759 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1764 pthread_mutex_lock (&dev
->image_lock
);
1765 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1766 if (image
->target_data
== target_data
)
1768 if (!nvptx_do_global_cdtors (image
->module
, dev
,
1769 "__do_global_dtors__entry"
1770 /* or "__do_global_dtors__entry__mgomp" */))
1773 *prev_p
= image
->next
;
1774 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1780 pthread_mutex_unlock (&dev
->image_lock
);
1785 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1787 if (!nvptx_attach_host_thread_to_device (ord
))
1790 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1791 struct ptx_free_block
*blocks
, *tmp
;
1793 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1794 blocks
= ptx_dev
->free_blocks
;
1795 ptx_dev
->free_blocks
= NULL
;
1796 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1798 nvptx_stacks_free (ptx_dev
, false);
1803 nvptx_free (blocks
->ptr
, ptx_dev
);
1808 void *d
= nvptx_alloc (size
, true);
1813 /* Memory allocation failed. Try freeing the stacks block, and
1815 nvptx_stacks_free (ptx_dev
, true);
1816 return nvptx_alloc (size
, false);
1821 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1823 return (nvptx_attach_host_thread_to_device (ord
)
1824 && nvptx_free (ptr
, ptx_devices
[ord
]));
1828 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *),
1829 size_t mapnum
__attribute__((unused
)),
1830 void **hostaddrs
__attribute__((unused
)),
1832 unsigned *dims
, void *targ_mem_desc
)
1834 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1836 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1837 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, NULL
);
1839 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1840 const char *maybe_abort_msg
= "(perhaps abort was called)";
1841 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1842 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1844 else if (r
!= CUDA_SUCCESS
)
1845 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1849 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *),
1850 size_t mapnum
__attribute__((unused
)),
1851 void **hostaddrs
__attribute__((unused
)),
1853 unsigned *dims
, void *targ_mem_desc
,
1854 struct goacc_asyncqueue
*aq
)
1856 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__
);
1858 CUdeviceptr dp
= (CUdeviceptr
) devaddrs
;
1859 nvptx_exec (fn
, dims
, targ_mem_desc
, dp
, aq
->cuda_stream
);
1863 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1865 struct ptx_device
*ptx_dev
;
1866 struct nvptx_thread
*nvthd
1867 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1870 ptx_dev
= ptx_devices
[ord
];
1874 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1876 assert (ptx_dev
->ctx
);
1879 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1881 nvthd
->ptx_dev
= ptx_dev
;
1883 return (void *) nvthd
;
/* Release per-thread data allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* OpenACC interop: return the current CUDA device handle.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* OpenACC interop: return the current CUDA context.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1904 /* This returns a CUstream. */
1906 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
1908 return (void *) aq
->cuda_stream
;
1911 /* This takes a CUstream. */
1913 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
1915 if (aq
->cuda_stream
)
1917 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1918 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1921 aq
->cuda_stream
= (CUstream
) stream
;
1925 static struct goacc_asyncqueue
*
1926 nvptx_goacc_asyncqueue_construct (unsigned int flags
)
1928 CUstream stream
= NULL
;
1929 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, flags
);
1931 struct goacc_asyncqueue
*aq
1932 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1933 aq
->cuda_stream
= stream
;
1937 struct goacc_asyncqueue
*
1938 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1940 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT
);
1944 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue
*aq
)
1946 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
/* Plugin entry point: destroy async queue AQ.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_destruct (aq);
}
1958 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1960 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1961 if (r
== CUDA_SUCCESS
)
1963 if (r
== CUDA_ERROR_NOT_READY
)
1966 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
1971 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue
*aq
)
1973 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
/* Plugin entry point: synchronize with async queue AQ.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_synchronize (aq);
}
1984 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1985 struct goacc_asyncqueue
*aq2
)
1988 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1989 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1990 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1995 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1997 if (res
!= CUDA_SUCCESS
)
1998 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1999 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
2005 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
2006 void (*callback_fn
)(void *),
2009 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
2010 b
->fn
= callback_fn
;
2013 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
2014 cuda_callback_wrapper
, (void *) b
, 0);
2018 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
2026 GOMP_PLUGIN_error ("invalid device address");
2029 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
2032 GOMP_PLUGIN_error ("invalid device address");
2037 GOMP_PLUGIN_error ("invalid host address");
2042 GOMP_PLUGIN_error ("invalid host or device address");
2045 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
2047 GOMP_PLUGIN_error ("invalid size");
2054 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
2056 if (!nvptx_attach_host_thread_to_device (ord
)
2057 || !cuda_memcpy_sanity_check (src
, dst
, n
))
2059 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
2064 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
2066 if (!nvptx_attach_host_thread_to_device (ord
)
2067 || !cuda_memcpy_sanity_check (dst
, src
, n
))
2069 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
2074 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
2076 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
2081 GOMP_OFFLOAD_memcpy2d (int dst_ord
, int src_ord
, size_t dim1_size
,
2082 size_t dim0_len
, void *dst
, size_t dst_offset1_size
,
2083 size_t dst_offset0_len
, size_t dst_dim1_size
,
2084 const void *src
, size_t src_offset1_size
,
2085 size_t src_offset0_len
, size_t src_dim1_size
)
2087 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
2090 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2094 memset (&data
, 0, sizeof (data
));
2095 data
.WidthInBytes
= dim1_size
;
2096 data
.Height
= dim0_len
;
2100 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
2105 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
2106 data
.dstDevice
= (CUdeviceptr
) dst
;
2108 data
.dstPitch
= dst_dim1_size
;
2109 data
.dstXInBytes
= dst_offset1_size
;
2110 data
.dstY
= dst_offset0_len
;
2114 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
2119 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
2120 data
.srcDevice
= (CUdeviceptr
) src
;
2122 data
.srcPitch
= src_dim1_size
;
2123 data
.srcXInBytes
= src_offset1_size
;
2124 data
.srcY
= src_offset0_len
;
2126 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0)
2128 /* Adjust origin to the actual array data, else the CUDA 2D memory
2129 copy API calls below may fail to validate source/dest pointers
2130 correctly (especially for Fortran where the "virtual origin" of an
2131 array is often outside the stored data). */
2133 data
.srcHost
= (const void *) ((const char *) data
.srcHost
2134 + data
.srcY
* data
.srcPitch
2135 + data
.srcXInBytes
);
2137 data
.srcDevice
+= data
.srcY
* data
.srcPitch
+ data
.srcXInBytes
;
2138 data
.srcXInBytes
= 0;
2142 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0)
2146 data
.dstHost
= (void *) ((char *) data
.dstHost
2147 + data
.dstY
* data
.dstPitch
2148 + data
.dstXInBytes
);
2150 data
.dstDevice
+= data
.dstY
* data
.dstPitch
+ data
.dstXInBytes
;
2151 data
.dstXInBytes
= 0;
2155 CUresult res
= CUDA_CALL_NOCHECK (cuMemcpy2D
, &data
);
2156 if (res
== CUDA_ERROR_INVALID_VALUE
)
2157 /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
2158 for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
2159 with an error; try the slower cuMemcpy2DUnaligned now. */
2160 CUDA_CALL (cuMemcpy2DUnaligned
, &data
);
2161 else if (res
!= CUDA_SUCCESS
)
2163 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res
));
2170 GOMP_OFFLOAD_memcpy3d (int dst_ord
, int src_ord
, size_t dim2_size
,
2171 size_t dim1_len
, size_t dim0_len
, void *dst
,
2172 size_t dst_offset2_size
, size_t dst_offset1_len
,
2173 size_t dst_offset0_len
, size_t dst_dim2_size
,
2174 size_t dst_dim1_len
, const void *src
,
2175 size_t src_offset2_size
, size_t src_offset1_len
,
2176 size_t src_offset0_len
, size_t src_dim2_size
,
2177 size_t src_dim1_len
)
2179 if (!nvptx_attach_host_thread_to_device (src_ord
!= -1 ? src_ord
: dst_ord
))
2182 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
2186 memset (&data
, 0, sizeof (data
));
2187 data
.WidthInBytes
= dim2_size
;
2188 data
.Height
= dim1_len
;
2189 data
.Depth
= dim0_len
;
2193 data
.dstMemoryType
= CU_MEMORYTYPE_HOST
;
2198 data
.dstMemoryType
= CU_MEMORYTYPE_DEVICE
;
2199 data
.dstDevice
= (CUdeviceptr
) dst
;
2201 data
.dstPitch
= dst_dim2_size
;
2202 data
.dstHeight
= dst_dim1_len
;
2203 data
.dstXInBytes
= dst_offset2_size
;
2204 data
.dstY
= dst_offset1_len
;
2205 data
.dstZ
= dst_offset0_len
;
2209 data
.srcMemoryType
= CU_MEMORYTYPE_HOST
;
2214 data
.srcMemoryType
= CU_MEMORYTYPE_DEVICE
;
2215 data
.srcDevice
= (CUdeviceptr
) src
;
2217 data
.srcPitch
= src_dim2_size
;
2218 data
.srcHeight
= src_dim1_len
;
2219 data
.srcXInBytes
= src_offset2_size
;
2220 data
.srcY
= src_offset1_len
;
2221 data
.srcZ
= src_offset0_len
;
2223 if (data
.srcXInBytes
!= 0 || data
.srcY
!= 0 || data
.srcZ
!= 0)
2225 /* Adjust origin to the actual array data, else the CUDA 3D memory
2226 copy API call below may fail to validate source/dest pointers
2227 correctly (especially for Fortran where the "virtual origin" of an
2228 array is often outside the stored data). */
2231 = (const void *) ((const char *) data
.srcHost
2232 + (data
.srcZ
* data
.srcHeight
+ data
.srcY
)
2234 + data
.srcXInBytes
);
2237 += (data
.srcZ
* data
.srcHeight
+ data
.srcY
) * data
.srcPitch
2239 data
.srcXInBytes
= 0;
2244 if (data
.dstXInBytes
!= 0 || data
.dstY
!= 0 || data
.dstZ
!= 0)
2248 data
.dstHost
= (void *) ((char *) data
.dstHost
2249 + (data
.dstZ
* data
.dstHeight
+ data
.dstY
)
2251 + data
.dstXInBytes
);
2254 += (data
.dstZ
* data
.dstHeight
+ data
.dstY
) * data
.dstPitch
2256 data
.dstXInBytes
= 0;
2261 CUDA_CALL (cuMemcpy3D
, &data
);
2266 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
2267 size_t n
, struct goacc_asyncqueue
*aq
)
2269 if (!nvptx_attach_host_thread_to_device (ord
)
2270 || !cuda_memcpy_sanity_check (src
, dst
, n
))
2272 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
2277 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
2278 size_t n
, struct goacc_asyncqueue
*aq
)
2280 if (!nvptx_attach_host_thread_to_device (ord
)
2281 || !cuda_memcpy_sanity_check (dst
, src
, n
))
2283 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
2287 union goacc_property_value
2288 GOMP_OFFLOAD_openacc_get_property (int n
, enum goacc_property prop
)
2290 union goacc_property_value propval
= { .val
= 0 };
2292 pthread_mutex_lock (&ptx_dev_lock
);
2294 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
2296 pthread_mutex_unlock (&ptx_dev_lock
);
2300 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
2303 case GOACC_PROPERTY_MEMORY
:
2307 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
2308 propval
.val
= total_mem
;
2311 case GOACC_PROPERTY_FREE_MEMORY
:
2317 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
2318 if (ptx_dev
->dev
== ctxdev
)
2319 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2320 else if (ptx_dev
->ctx
)
2324 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
2325 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2326 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
2332 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
2334 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
2335 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
2337 propval
.val
= free_mem
;
2340 case GOACC_PROPERTY_NAME
:
2341 propval
.ptr
= ptx_dev
->name
;
2343 case GOACC_PROPERTY_VENDOR
:
2344 propval
.ptr
= "Nvidia";
2346 case GOACC_PROPERTY_DRIVER
:
2347 propval
.ptr
= cuda_driver_version_s
;
2353 pthread_mutex_unlock (&ptx_dev_lock
);
2357 /* Adjust launch dimensions: pick good values for number of blocks and warps
2358 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2362 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
2363 struct ptx_device
*ptx_dev
,
2364 int *teams_p
, int *threads_p
)
2366 int max_warps_block
= fn
->max_threads_per_block
/ 32;
2367 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2368 and libgcc, which matches documented limit of all GPUs as of 2015. */
2369 if (max_warps_block
> 32)
2370 max_warps_block
= 32;
2371 if (*threads_p
<= 0)
2373 if (*threads_p
> max_warps_block
)
2374 *threads_p
= max_warps_block
;
2376 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
2377 /* This is an estimate of how many blocks the device can host simultaneously.
2378 Actual limit, which may be lower, can be queried with "occupancy control"
2379 driver interface (since CUDA 6.0). */
2380 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
2381 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
2382 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   kernel launches.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
2394 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2395 the storage should be held on entry, and remains held on exit. */
2398 nvptx_stacks_acquire (struct ptx_device
*ptx_dev
, size_t size
, int num
)
2400 if (ptx_dev
->omp_stacks
.ptr
&& ptx_dev
->omp_stacks
.size
>= size
* num
)
2401 return (void *) ptx_dev
->omp_stacks
.ptr
;
2403 /* Free the old, too-small stacks. */
2404 if (ptx_dev
->omp_stacks
.ptr
)
2406 CUresult r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2407 if (r
!= CUDA_SUCCESS
)
2408 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r
));
2409 r
= CUDA_CALL_NOCHECK (cuMemFree
, ptx_dev
->omp_stacks
.ptr
);
2410 if (r
!= CUDA_SUCCESS
)
2411 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
2414 /* Make new and bigger stacks, and remember where we put them and how big
2416 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &ptx_dev
->omp_stacks
.ptr
,
2418 if (r
!= CUDA_SUCCESS
)
2419 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
2421 ptx_dev
->omp_stacks
.size
= size
* num
;
2423 return (void *) ptx_dev
->omp_stacks
.ptr
;
2428 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
2430 struct targ_fn_descriptor
*tgt_fn_desc
2431 = (struct targ_fn_descriptor
*) tgt_fn
;
2432 CUfunction function
= tgt_fn_desc
->fn
;
2433 const struct targ_fn_launch
*launch
= tgt_fn_desc
->launch
;
2434 const char *fn_name
= launch
->fn
;
2436 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
2437 const char *maybe_abort_msg
= "(perhaps abort was called)";
2438 int teams
= 0, threads
= 0;
2441 GOMP_PLUGIN_fatal ("No target arguments provided");
2444 intptr_t id
= (intptr_t) *args
++, val
;
2445 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
2446 val
= (intptr_t) *args
++;
2448 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
2449 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
2451 val
= val
> INT_MAX
? INT_MAX
: val
;
2452 id
&= GOMP_TARGET_ARG_ID_MASK
;
2453 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
2455 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
2458 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
2460 bool reverse_offload
= ptx_dev
->rev_data
!= NULL
;
2461 struct goacc_asyncqueue
*reverse_offload_aq
= NULL
;
2462 if (reverse_offload
)
2465 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING
);
2466 if (!reverse_offload_aq
)
2467 exit (EXIT_FAILURE
);
2470 size_t stack_size
= nvptx_stacks_size ();
2472 pthread_mutex_lock (&ptx_dev
->omp_stacks
.lock
);
2473 void *stacks
= nvptx_stacks_acquire (ptx_dev
, stack_size
, teams
* threads
);
2474 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
2475 size_t fn_args_size
= sizeof fn_args
;
2477 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
2478 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
2481 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2482 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2483 __FUNCTION__
, fn_name
, teams
, threads
);
2484 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
2485 32, threads
, 1, lowlat_pool_size
, NULL
, NULL
, config
);
2486 if (r
!= CUDA_SUCCESS
)
2487 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
2488 if (reverse_offload
)
2491 r
= CUDA_CALL_NOCHECK (cuStreamQuery
, NULL
);
2492 if (r
== CUDA_SUCCESS
)
2494 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2495 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r
),
2497 else if (r
!= CUDA_ERROR_NOT_READY
)
2498 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
2500 if (__atomic_load_n (&ptx_dev
->rev_data
->fn
, __ATOMIC_ACQUIRE
) != 0)
2502 struct rev_offload
*rev_data
= ptx_dev
->rev_data
;
2503 GOMP_PLUGIN_target_rev (rev_data
->fn
, rev_data
->mapnum
,
2504 rev_data
->addrs
, rev_data
->sizes
,
2505 rev_data
->kinds
, rev_data
->dev_num
,
2506 reverse_offload_aq
);
2507 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq
))
2508 exit (EXIT_FAILURE
);
2509 __atomic_store_n (&rev_data
->fn
, 0, __ATOMIC_RELEASE
);
2514 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
2515 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
2516 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
2518 else if (r
!= CUDA_SUCCESS
)
2519 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
2521 pthread_mutex_unlock (&ptx_dev
->omp_stacks
.lock
);
2523 if (reverse_offload
)
2525 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq
))
2526 exit (EXIT_FAILURE
);
2530 /* TODO: Implement GOMP_OFFLOAD_async_run. */