/* Plugin for NVPTX execution.

   Copyright (C) 2013-2021 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728
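/* Note: 134217728 == 128 * 1024 * 1024, i.e. the 128MB limit mentioned in the
   comment above.  */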
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif
#if CUDA_VERSION >= 6050
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

struct cuda_lib_s {
# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL
} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
#include "secure_getenv.h"

#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)			\
  do {							\
    unsigned __r					\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)				\
      {							\
	GOMP_PLUGIN_error (#FN " error: %s",		\
			   cuda_error (__r));		\
	return ERET;					\
      }							\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)			\
  do {							\
    unsigned __r					\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);		\
    if (__r != CUDA_SUCCESS)				\
      GOMP_PLUGIN_fatal (#FN " error: %s",		\
			 cuda_error (__r));		\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN
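/* Illustrative usage of the macros above: inside a bool-returning function,
   'CUDA_CALL (cuMemAlloc, &d, size);' reports and returns false on failure,
   'CUDA_CALL_ASSERT (cuMemAlloc, &d, size);' aborts via GOMP_PLUGIN_fatal, and
   'CUDA_CALL_NOCHECK (cuMemAlloc, &d, size)' leaves the CUresult to the
   caller.  */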
static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    fallback = desc;

  return fallback;
}
/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};
/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  size_t ptx_num;

  const char *const *var_names;
  size_t var_num;

  const struct targ_fn_launch *fn_descs;
  size_t fn_num;
} nvptx_tdata_t;
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};
/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  bool overlap;
  bool concur;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  /* OpenMP stacks, cached between kernel invocations.  */
  struct
    {
      CUdeviceptr ptr;
      size_t size;
      pthread_mutex_t lock;
    } omp_stacks;

  struct ptx_device *next;
};
static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "%u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  ptx_dev->omp_stacks.ptr = 0;
  ptx_dev->omp_stacks.size = 0;
  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);

  return ptx_dev;
}
static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);

  if (ptx_dev->omp_stacks.ptr)
    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}
static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}
static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}
static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  unsigned nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}
static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);
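      /* Illustrative example (hypothetical numbers, not taken from any real
	 device): with max_threads_per_block = 1024, warp_size = 32,
	 max_threads_per_multiprocessor = 2048 and num_sms = 80, the defaults
	 computed above would be gang = (2048 / 1024) * 80 = 160,
	 worker = 1024 / 32 = 32, and vector = warp_size = 32.  */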
      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       this case.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);
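	    /* For instance (purely illustrative numbers): if the occupancy
	       query reported grids = 40 and blocks = 512 with warp_size = 32,
	       the heuristic above picks
	       gangs = 2 * 40 * (512 / 32) = 1280.  */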
	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d.\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void *openacc_get_current_cuda_context (void);
static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.  */

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}
static void *
nvptx_alloc (size_t s, bool suppress_errors)
{
  CUdeviceptr d;

  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}
static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where all CUDA calls are not allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of device
	 memory later.  */
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);
  return true;
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}
bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, the only other entry kind is 'device number'.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  CUdeviceptr device_num_varptr;
  size_t device_num_varsize;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
				  &device_num_varsize, module,
				  STRINGX (GOMP_DEVICE_NUM_VAR));
  if (r == CUDA_SUCCESS)
    {
      targ_tbl->start = (uintptr_t) device_num_varptr;
      targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
    }
  else
    /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image.  */
    targ_tbl->start = targ_tbl->end = 0;

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries + other_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];
  bool ret = true;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  nvptx_stacks_free (ptx_dev, false);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  void *d = nvptx_alloc (size, true);
  if (d)
    return d;
  else
    {
      /* Memory allocation failed.  Try freeing the stacks block, and
	 retrying.  */
      nvptx_stacks_free (ptx_dev, true);
      return nvptx_alloc (size, false);
    }
}
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      hp = alloca (s);
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
			mapnum * sizeof (void *));
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  CUDA_CALL_ASSERT (cuMemFree, dp);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, (void *) dp);
}
static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}
void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;
  void **block = NULL;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
      hp = block + 2;
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}

      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
			mapnum * sizeof (void *), aq->cuda_stream);
      block[0] = (void *) dp;

      struct nvptx_thread *nvthd =
	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
      block[1] = (void *) nvthd->ptx_dev;

      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, aq->cuda_stream);

  if (mapnum > 0)
    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* This returns a CUstream.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
  return (void *) aq->cuda_stream;
}
/* This takes a CUstream.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
{
  if (aq->cuda_stream)
    {
      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    }

  aq->cuda_stream = (CUstream) stream;
  return 1;
}
struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  return aq;
}
bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
  free (aq);
  return true;
}
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}
static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}
static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}
bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}
bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}
bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
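/* Worked example with made-up numbers: for a kernel using
   fn->regs_per_thread = 32 registers with fn->max_threads_per_block = 1024,
   max_warps_block = 1024 / 32 = 32.  With *threads_p = 8 warps,
   regs_per_block = 32 * 32 * 8 = 8192 registers, so on a device with
   regs_per_sm = 65536 and num_sms = 80 the estimate above is
   max_blocks = 65536 / 8192 * 80 = 640 teams.  */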
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   testing.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}
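/* For reference, the cached block allocated above is size * num bytes.  With
   a (hypothetical) per-warp stack of 128 KiB and a launch of
   teams * threads = 640 * 8 = 5120 warps, that is 128 KiB * 5120 = 640 MiB of
   device memory, reused by later launches while it stays large enough and
   below the SOFTSTACK_CACHE_LIMIT policy handled in nvptx_stacks_free.  */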
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */