libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2024 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #define _GNU_SOURCE
  35 #include "openacc.h"
  36 #include "config.h"
  37 #include "symcat.h"
  38 #include "libgomp-plugin.h"
  39 #include "oacc-plugin.h"
  40 #include "gomp-constants.h"
  41 #include "oacc-int.h"
  42
  43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
  44 #include "config/nvptx/libgomp-nvptx.h"
  45
  46 #include <pthread.h>
  47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
  48 # include "cuda/cuda.h"
  49 #else
  50 # include <cuda.h>
  51 #endif
  52 #include <stdbool.h>
  53 #include <limits.h>
  54 #include <string.h>
  55 #include <stdio.h>
  56 #include <unistd.h>
  57 #include <assert.h>
  58 #include <errno.h>
  59 #include <stdlib.h>
  60
  61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
  62    block to cache between kernel invocations.  For soft-stacks blocks bigger
  63    than this, we will free the block before attempting another GPU memory
  64    allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
  65    we will free the cached soft-stacks block anyway then retry the
  66    allocation.  If that fails too, we lose.  */
  67
  68 #define SOFTSTACK_CACHE_LIMIT 134217728
  69
  70 #if CUDA_VERSION < 6000
  71 extern CUresult cuGetErrorString (CUresult, const char **);
  72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
  73 #endif
  74
  75 #if CUDA_VERSION >= 6050
  76 #undef cuLinkCreate
  77 #undef cuLinkAddData
  78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
  79                         const char *, unsigned, CUjit_option *, void **);
  80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
  81 #else
  82 typedef size_t (*CUoccupancyB2DSize)(int);
  83 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
  84                            const char *, unsigned, CUjit_option *, void **);
  85 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
  86 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
  87                                           CUoccupancyB2DSize, size_t, int);
  88 #endif
  89
  90 #define DO_PRAGMA(x) _Pragma (#x)
  91
  92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
  93 # include <dlfcn.h>
  94
  95 struct cuda_lib_s {
  96
  97 # define CUDA_ONE_CALL(call)                    \
  98   __typeof (call) *call;
  99 # define CUDA_ONE_CALL_MAYBE_NULL(call)         \
 100   CUDA_ONE_CALL (call)
 101 #include "cuda-lib.def"
 102 # undef CUDA_ONE_CALL
 103 # undef CUDA_ONE_CALL_MAYBE_NULL
 104
 105 } cuda_lib;
 106
 107 /* -1 if init_cuda_lib has not been called yet, false
 108    if it has been and failed, true if it has been and succeeded.  */
 109 static signed char cuda_lib_inited = -1;
 110
 111 /* Dynamically load the CUDA runtime library and initialize function
 112    pointers, return false if unsuccessful, true if successful.  */
 113 static bool
 114 init_cuda_lib (void)
 115 {
 116   if (cuda_lib_inited != -1)
 117     return cuda_lib_inited;
 118   const char *cuda_runtime_lib = "libcuda.so.1";
 119   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
 120   cuda_lib_inited = false;
 121   if (h == NULL)
 122     return false;
 123
 124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
 125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
 126 # define CUDA_ONE_CALL_1(call, allow_null)              \
 127   cuda_lib.call = dlsym (h, #call);     \
 128   if (!allow_null && cuda_lib.call == NULL)             \
 129     return false;
 130 #include "cuda-lib.def"
 131 # undef CUDA_ONE_CALL
 132 # undef CUDA_ONE_CALL_1
 133 # undef CUDA_ONE_CALL_MAYBE_NULL
 134
 135   cuda_lib_inited = true;
 136   return true;
 137 }
 138 # define CUDA_CALL_PREFIX cuda_lib.
 139 #else
 140
 141 # define CUDA_ONE_CALL(call)
 142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
 143 #include "cuda-lib.def"
 144 #undef CUDA_ONE_CALL_MAYBE_NULL
 145 #undef CUDA_ONE_CALL
 146
 147 # define CUDA_CALL_PREFIX
 148 # define init_cuda_lib() true
 149 #endif
 150
 151 #include "secure_getenv.h"
 152
 153 #undef MIN
 154 #undef MAX
 155 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 156 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 157
 158 /* Convenience macros for the frequently used CUDA library call and
 159    error handling sequence as well as CUDA library calls that
 160    do the error checking themselves or don't do it at all.  */
 161
 162 #define CUDA_CALL_ERET(ERET, FN, ...)           \
 163   do {                                          \
 164     unsigned __r                                \
 165       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 166     if (__r != CUDA_SUCCESS)                    \
 167       {                                         \
 168         GOMP_PLUGIN_error (#FN " error: %s",    \
 169                            cuda_error (__r));   \
 170         return ERET;                            \
 171       }                                         \
 172   } while (0)
 173
 174 #define CUDA_CALL(FN, ...)                      \
 175   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 176
 177 #define CUDA_CALL_ASSERT(FN, ...)               \
 178   do {                                          \
 179     unsigned __r                                \
 180       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 181     if (__r != CUDA_SUCCESS)                    \
 182       {                                         \
 183         GOMP_PLUGIN_fatal (#FN " error: %s",    \
 184                            cuda_error (__r));   \
 185       }                                         \
 186   } while (0)
 187
 188 #define CUDA_CALL_NOCHECK(FN, ...)              \
 189   CUDA_CALL_PREFIX FN (__VA_ARGS__)
 190
 191 #define CUDA_CALL_EXISTS(FN)                    \
 192   CUDA_CALL_PREFIX FN
 193
 194 static const char *
 195 cuda_error (CUresult r)
 196 {
 197   const char *fallback = "unknown cuda error";
 198   const char *desc;
 199
 200   if (!CUDA_CALL_EXISTS (cuGetErrorString))
 201     return fallback;
 202
 203   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
 204   if (r == CUDA_SUCCESS)
 205     return desc;
 206
 207   return fallback;
 208 }
 209
 210 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
 211    Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
 212 static char cuda_driver_version_s[30];
 213
 214 static unsigned int instantiated_devices = 0;
 215 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 216
 217 /* NVPTX/CUDA specific definition of asynchronous queues.  */
 218 struct goacc_asyncqueue
 219 {
 220   CUstream cuda_stream;
 221 };
 222
 223 struct nvptx_callback
 224 {
 225   void (*fn) (void *);
 226   void *ptr;
 227   struct goacc_asyncqueue *aq;
 228   struct nvptx_callback *next;
 229 };
 230
 231 /* Thread-specific data for PTX.  */
 232
 233 struct nvptx_thread
 234 {
 235   /* We currently have this embedded inside the plugin because libgomp manages
 236      devices through integer target_ids.  This might be better if using an
 237      opaque target-specific pointer directly from gomp_device_descr.  */
 238   struct ptx_device *ptx_dev;
 239 };
 240
 241 /* Target data function launch information.  */
 242
 243 struct targ_fn_launch
 244 {
 245   const char *fn;
 246   unsigned short dim[GOMP_DIM_MAX];
 247 };
 248
 249 /* Target PTX object information.  */
 250
 251 struct targ_ptx_obj
 252 {
 253   const char *code;
 254   size_t size;
 255 };
 256
 257 /* Target data image information.  */
 258
 259 typedef struct nvptx_tdata
 260 {
 261   const struct targ_ptx_obj *ptx_objs;
 262   unsigned ptx_num;
 263
 264   const char *const *var_names;
 265   unsigned var_num;
 266
 267   const struct targ_fn_launch *fn_descs;
 268   unsigned fn_num;
 269
 270   unsigned ind_fn_num;
 271 } nvptx_tdata_t;
 272
 273 /* Descriptor of a loaded function.  */
 274
 275 struct targ_fn_descriptor
 276 {
 277   CUfunction fn;
 278   const struct targ_fn_launch *launch;
 279   int regs_per_thread;
 280   int max_threads_per_block;
 281 };
 282
 283 /* A loaded PTX image.  */
 284 struct ptx_image_data
 285 {
 286   const void *target_data;
 287   CUmodule module;
 288
 289   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 290
 291   struct ptx_image_data *next;
 292 };
 293
 294 struct ptx_free_block
 295 {
 296   void *ptr;
 297   struct ptx_free_block *next;
 298 };
 299
 300 struct ptx_device
 301 {
 302   CUcontext ctx;
 303   bool ctx_shared;
 304   CUdevice dev;
 305
 306   int ord;
 307   bool overlap;
 308   bool map;
 309   bool concur;
 310   bool mkern;
 311   int mode;
 312   int clock_khz;
 313   int num_sms;
 314   int regs_per_block;
 315   int regs_per_sm;
 316   int warp_size;
 317   int max_threads_per_block;
 318   int max_threads_per_multiprocessor;
 319   int default_dims[GOMP_DIM_MAX];
 320
 321   /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
 322   char name[256];
 323
 324   struct ptx_image_data *images;  /* Images loaded on device.  */
 325   pthread_mutex_t image_lock;     /* Lock for above list.  */
 326
 327   struct ptx_free_block *free_blocks;
 328   pthread_mutex_t free_blocks_lock;
 329
 330   /* OpenMP stacks, cached between kernel invocations.  */
 331   struct
 332     {
 333       CUdeviceptr ptr;
 334       size_t size;
 335       pthread_mutex_t lock;
 336     } omp_stacks;
 337
 338   struct rev_offload *rev_data;
 339   struct ptx_device *next;
 340 };
 341
 342 static struct ptx_device **ptx_devices;
 343
 344 /* OpenMP kernels reserve a small amount of ".shared" space for use by
 345    omp_alloc.  The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
 346    default is set here.  */
 347 static unsigned lowlat_pool_size = 8 * 1024;
 348
 349 static inline struct nvptx_thread *
 350 nvptx_thread (void)
 351 {
 352   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 353 }
 354
 355 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 356    should be locked on entry and remains locked on exit.  */
 357
 358 static bool
 359 nvptx_init (void)
 360 {
 361   int ndevs;
 362
 363   if (instantiated_devices != 0)
 364     return true;
 365
 366   if (!init_cuda_lib ())
 367     return false;
 368
 369   CUDA_CALL (cuInit, 0);
 370
 371   int cuda_driver_version;
 372   CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
 373   snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
 374             "CUDA Driver %u.%u",
 375             cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
 376
 377   CUDA_CALL (cuDeviceGetCount, &ndevs);
 378   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 379                                             * ndevs);
 380
 381   return true;
 382 }
 383
 384 /* Select the N'th PTX device for the current host thread.  The device must
 385    have been previously opened before calling this function.  */
 386
 387 static bool
 388 nvptx_attach_host_thread_to_device (int n)
 389 {
 390   CUdevice dev;
 391   CUresult r;
 392   struct ptx_device *ptx_dev;
 393   CUcontext thd_ctx;
 394
 395   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
 396   if (r == CUDA_ERROR_NOT_PERMITTED)
 397     {
 398       /* Assume we're in a CUDA callback, just return true.  */
 399       return true;
 400     }
 401   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 402     {
 403       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 404       return false;
 405     }
 406
 407   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 408     return true;
 409   else
 410     {
 411       CUcontext old_ctx;
 412
 413       ptx_dev = ptx_devices[n];
 414       if (!ptx_dev)
 415         {
 416           GOMP_PLUGIN_error ("device %d not found", n);
 417           return false;
 418         }
 419
 420       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 421
 422       /* We don't necessarily have a current context (e.g. if it has been
 423          destroyed.  Pop it if we do though.  */
 424       if (thd_ctx != NULL)
 425         CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 426
 427       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
 428     }
 429   return true;
 430 }
 431
 432 static struct ptx_device *
 433 nvptx_open_device (int n)
 434 {
 435   struct ptx_device *ptx_dev;
 436   CUdevice dev, ctx_dev;
 437   CUresult r;
 438   int pi;
 439
 440   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 441
 442   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 443
 444   ptx_dev->ord = n;
 445   ptx_dev->dev = dev;
 446   ptx_dev->ctx_shared = false;
 447
 448   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
 449   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 450     {
 451       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 452       return NULL;
 453     }
 454
 455   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 456     {
 457       /* The current host thread has an active context for a different device.
 458          Detach it.  */
 459       CUcontext old_ctx;
 460       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
 461     }
 462
 463   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 464
 465   if (!ptx_dev->ctx)
 466     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 467   else
 468     ptx_dev->ctx_shared = true;
 469
 470   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 471                   &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 472   ptx_dev->overlap = pi;
 473
 474   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 475                   &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 476   ptx_dev->map = pi;
 477
 478   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 479                   &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 480   ptx_dev->concur = pi;
 481
 482   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 483                   &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 484   ptx_dev->mode = pi;
 485
 486   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 487                   &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 488   ptx_dev->mkern = pi;
 489
 490   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 491                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 492   ptx_dev->clock_khz = pi;
 493
 494   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 495                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 496   ptx_dev->num_sms = pi;
 497
 498   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 499                   &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 500   ptx_dev->regs_per_block = pi;
 501
 502   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
 503      in CUDA 6.0 and newer.  */
 504   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
 505                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
 506                          dev);
 507   /* Fallback: use limit of registers per block, which is usually equal.  */
 508   if (r == CUDA_ERROR_INVALID_VALUE)
 509     pi = ptx_dev->regs_per_block;
 510   else if (r != CUDA_SUCCESS)
 511     {
 512       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
 513       return NULL;
 514     }
 515   ptx_dev->regs_per_sm = pi;
 516
 517   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 518                   &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 519   if (pi != 32)
 520     {
 521       GOMP_PLUGIN_error ("Only warp size 32 is supported");
 522       return NULL;
 523     }
 524   ptx_dev->warp_size = pi;
 525
 526   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 527                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 528   ptx_dev->max_threads_per_block = pi;
 529
 530   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 531                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
 532   ptx_dev->max_threads_per_multiprocessor = pi;
 533
 534   /* Required below for reverse offload as implemented, but with compute
 535      capability >= 2.0 and 64bit device processes, this should be universally be
 536      the case; hence, an assert.  */
 537   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
 538                          CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
 539   assert (r == CUDA_SUCCESS && pi);
 540
 541   for (int i = 0; i != GOMP_DIM_MAX; i++)
 542     ptx_dev->default_dims[i] = 0;
 543
 544   CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
 545                   dev);
 546
 547   ptx_dev->images = NULL;
 548   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 549
 550   ptx_dev->free_blocks = NULL;
 551   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 552
 553   ptx_dev->omp_stacks.ptr = 0;
 554   ptx_dev->omp_stacks.size = 0;
 555   pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
 556
 557   ptx_dev->rev_data = NULL;
 558
 559   return ptx_dev;
 560 }
 561
 562 static bool
 563 nvptx_close_device (struct ptx_device *ptx_dev)
 564 {
 565   if (!ptx_dev)
 566     return true;
 567
 568   for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
 569     {
 570       struct ptx_free_block *b_next = b->next;
 571       CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
 572       free (b);
 573       b = b_next;
 574     }
 575
 576   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
 577   pthread_mutex_destroy (&ptx_dev->image_lock);
 578
 579   pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
 580
 581   if (ptx_dev->omp_stacks.ptr)
 582     CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
 583
 584   if (!ptx_dev->ctx_shared)
 585     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 586
 587   free (ptx_dev);
 588   return true;
 589 }
 590
 591 static int
 592 nvptx_get_num_devices (void)
 593 {
 594   int n;
 595
 596   /* This function will be called before the plugin has been initialized in
 597      order to enumerate available devices, but CUDA API routines can't be used
 598      until cuInit has been called.  Just call it now (but don't yet do any
 599      further initialization).  */
 600   if (instantiated_devices == 0)
 601     {
 602       if (!init_cuda_lib ())
 603         return 0;
 604       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
 605       /* This is not an error: e.g. we may have CUDA libraries installed but
 606          no devices available.  */
 607       if (r != CUDA_SUCCESS)
 608         {
 609           GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
 610                              cuda_error (r));
 611           return 0;
 612         }
 613     }
 614
 615   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
 616   return n;
 617 }
 618
 619 static void
 620 notify_var (const char *var_name, const char *env_var)
 621 {
 622   if (env_var == NULL)
 623     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
 624   else
 625     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 626 }
 627
 628 static void
 629 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
 630 {
 631   const char *var_name = "GOMP_NVPTX_JIT";
 632   const char *env_var = secure_getenv (var_name);
 633   notify_var (var_name, env_var);
 634
 635   if (env_var == NULL)
 636     return;
 637
 638   const char *c = env_var;
 639   while (*c != '\0')
 640     {
 641       while (*c == ' ')
 642         c++;
 643
 644       if (c[0] == '-' && c[1] == 'O'
 645           && '0' <= c[2] && c[2] <= '4'
 646           && (c[3] == '\0' || c[3] == ' '))
 647         {
 648           *gomp_nvptx_o = c[2] - '0';
 649           c += 3;
 650           continue;
 651         }
 652
 653       GOMP_PLUGIN_error ("Error parsing %s", var_name);
 654       break;
 655     }
 656 }
 657
 658 static bool
 659 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 660           unsigned num_objs)
 661 {
 662   CUjit_option opts[7];
 663   void *optvals[7];
 664   float elapsed = 0.0;
 665   char elog[1024];
 666   char ilog[16384];
 667   CUlinkState linkstate;
 668   CUresult r;
 669   void *linkout;
 670   size_t linkoutsize __attribute__ ((unused));
 671
 672   opts[0] = CU_JIT_WALL_TIME;
 673   optvals[0] = &elapsed;
 674
 675   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 676   optvals[1] = &ilog[0];
 677
 678   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 679   optvals[2] = (void *) sizeof ilog;
 680
 681   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 682   optvals[3] = &elog[0];
 683
 684   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 685   optvals[4] = (void *) sizeof elog;
 686
 687   opts[5] = CU_JIT_LOG_VERBOSE;
 688   optvals[5] = (void *) 1;
 689
 690   static intptr_t gomp_nvptx_o = -1;
 691
 692   static bool init_done = false;
 693   if (!init_done)
 694     {
 695       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
 696       init_done = true;
 697   }
 698
 699   int nopts = 6;
 700   if (gomp_nvptx_o != -1)
 701     {
 702       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
 703       optvals[nopts] = (void *) gomp_nvptx_o;
 704       nopts++;
 705     }
 706
 707   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
 708     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
 709   else
 710     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
 711
 712   for (; num_objs--; ptx_objs++)
 713     {
 714       /* cuLinkAddData's 'data' argument erroneously omits the const
 715          qualifier.  */
 716       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 717       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
 718         r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
 719                                (char *) ptx_objs->code, ptx_objs->size,
 720                                0, 0, 0, 0);
 721       else
 722         r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
 723                                (char *) ptx_objs->code, ptx_objs->size,
 724                                0, 0, 0, 0);
 725       if (r != CUDA_SUCCESS)
 726         {
 727           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 728           GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
 729                              cuda_error (r));
 730           return false;
 731         }
 732     }
 733
 734   GOMP_PLUGIN_debug (0, "Linking\n");
 735   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 736
 737   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 738   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 739
 740   if (r != CUDA_SUCCESS)
 741     {
 742       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 743       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
 744       return false;
 745     }
 746
 747   CUDA_CALL (cuModuleLoadData, module, linkout);
 748   CUDA_CALL (cuLinkDestroy, linkstate);
 749   return true;
 750 }
 751
 752 static void
 753 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
 754             CUdeviceptr dp, CUstream stream)
 755 {
 756   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
 757   CUfunction function;
 758   int i;
 759   void *kargs[1];
 760   struct nvptx_thread *nvthd = nvptx_thread ();
 761   int warp_size = nvthd->ptx_dev->warp_size;
 762
 763   function = targ_fn->fn;
 764
 765   /* Initialize the launch dimensions.  Typically this is constant,
 766      provided by the device compiler, but we must permit runtime
 767      values.  */
 768   int seen_zero = 0;
 769   for (i = 0; i != GOMP_DIM_MAX; i++)
 770     {
 771       if (targ_fn->launch->dim[i])
 772        dims[i] = targ_fn->launch->dim[i];
 773       if (!dims[i])
 774        seen_zero = 1;
 775     }
 776
 777   if (seen_zero)
 778     {
 779       pthread_mutex_lock (&ptx_dev_lock);
 780
 781       static int gomp_openacc_dims[GOMP_DIM_MAX];
 782       if (!gomp_openacc_dims[0])
 783         {
 784           /* See if the user provided GOMP_OPENACC_DIM environment
 785              variable to specify runtime defaults.  */
 786           for (int i = 0; i < GOMP_DIM_MAX; ++i)
 787             gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
 788         }
 789
 790       if (!nvthd->ptx_dev->default_dims[0])
 791         {
 792           int default_dims[GOMP_DIM_MAX];
 793           for (int i = 0; i < GOMP_DIM_MAX; ++i)
 794             default_dims[i] = gomp_openacc_dims[i];
 795
 796           int gang, worker, vector;
 797           {
 798             int block_size = nvthd->ptx_dev->max_threads_per_block;
 799             int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
 800             int dev_size = nvthd->ptx_dev->num_sms;
 801             GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
 802                                " dev_size=%d, cpu_size=%d\n",
 803                                warp_size, block_size, dev_size, cpu_size);
 804
 805             gang = (cpu_size / block_size) * dev_size;
 806             worker = block_size / warp_size;
 807             vector = warp_size;
 808           }
 809
 810           /* There is no upper bound on the gang size.  The best size
 811              matches the hardware configuration.  Logical gangs are
 812              scheduled onto physical hardware.  To maximize usage, we
 813              should guess a large number.  */
 814           if (default_dims[GOMP_DIM_GANG] < 1)
 815             default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
 816           /* The worker size must not exceed the hardware.  */
 817           if (default_dims[GOMP_DIM_WORKER] < 1
 818               || (default_dims[GOMP_DIM_WORKER] > worker && gang))
 819             default_dims[GOMP_DIM_WORKER] = worker;
 820           /* The vector size must exactly match the hardware.  */
 821           if (default_dims[GOMP_DIM_VECTOR] < 1
 822               || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
 823             default_dims[GOMP_DIM_VECTOR] = vector;
 824
 825           GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
 826                              default_dims[GOMP_DIM_GANG],
 827                              default_dims[GOMP_DIM_WORKER],
 828                              default_dims[GOMP_DIM_VECTOR]);
 829
 830           for (i = 0; i != GOMP_DIM_MAX; i++)
 831             nvthd->ptx_dev->default_dims[i] = default_dims[i];
 832         }
 833       pthread_mutex_unlock (&ptx_dev_lock);
 834
 835       {
 836         bool default_dim_p[GOMP_DIM_MAX];
 837         for (i = 0; i != GOMP_DIM_MAX; i++)
 838           default_dim_p[i] = !dims[i];
 839
 840         if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
 841           {
 842             for (i = 0; i != GOMP_DIM_MAX; i++)
 843               if (default_dim_p[i])
 844                 dims[i] = nvthd->ptx_dev->default_dims[i];
 845
 846             if (default_dim_p[GOMP_DIM_VECTOR])
 847               dims[GOMP_DIM_VECTOR]
 848                 = MIN (dims[GOMP_DIM_VECTOR],
 849                        (targ_fn->max_threads_per_block / warp_size
 850                         * warp_size));
 851
 852             if (default_dim_p[GOMP_DIM_WORKER])
 853               dims[GOMP_DIM_WORKER]
 854                 = MIN (dims[GOMP_DIM_WORKER],
 855                        targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
 856           }
 857         else
 858           {
 859             /* Handle the case that the compiler allows the runtime to choose
 860                the vector-length conservatively, by ignoring
 861                gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
 862                it.  */
 863             int vectors = 0;
 864             /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
 865                gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
 866                exceed targ_fn->max_threads_per_block. */
 867             int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
 868             int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
 869             int grids, blocks;
 870
 871             CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
 872                               &blocks, function, NULL, 0,
 873                               dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
 874             GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
 875                                "grid = %d, block = %d\n", grids, blocks);
 876
 877             /* Keep the num_gangs proportional to the block size.  In
 878                the case were a block size is limited by shared-memory
 879                or the register file capacity, the runtime will not
 880                excessively over assign gangs to the multiprocessor
 881                units if their state is going to be swapped out even
 882                more than necessary. The constant factor 2 is there to
 883                prevent threads from idling when there is insufficient
 884                work for them.  */
 885             if (gangs == 0)
 886               gangs = 2 * grids * (blocks / warp_size);
 887
 888             if (vectors == 0)
 889               vectors = warp_size;
 890
 891             if (workers == 0)
 892               {
 893                 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
 894                                       ? vectors
 895                                       : dims[GOMP_DIM_VECTOR]);
 896                 workers = blocks / actual_vectors;
 897                 workers = MAX (workers, 1);
 898                 /* If we need a per-worker barrier ... .  */
 899                 if (actual_vectors > 32)
 900                   /* Don't use more barriers than available.  */
 901                   workers = MIN (workers, 15);
 902               }
 903
 904             for (i = 0; i != GOMP_DIM_MAX; i++)
 905               if (default_dim_p[i])
 906                 switch (i)
 907                   {
 908                   case GOMP_DIM_GANG: dims[i] = gangs; break;
 909                   case GOMP_DIM_WORKER: dims[i] = workers; break;
 910                   case GOMP_DIM_VECTOR: dims[i] = vectors; break;
 911                   default: GOMP_PLUGIN_fatal ("invalid dim");
 912                   }
 913           }
 914       }
 915     }
 916
 917   /* Check if the accelerator has sufficient hardware resources to
 918      launch the offloaded kernel.  */
 919   if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
 920       > targ_fn->max_threads_per_block)
 921     {
 922       const char *msg
 923         = ("The Nvidia accelerator has insufficient resources to launch '%s'"
 924            " with num_workers = %d and vector_length = %d"
 925            "; "
 926            "recompile the program with 'num_workers = x and vector_length = y'"
 927            " on that offloaded region or '-fopenacc-dim=:x:y' where"
 928            " x * y <= %d"
 929            ".\n");
 930       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
 931                          dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
 932     }
 933
 934   /* Check if the accelerator has sufficient barrier resources to
 935      launch the offloaded kernel.  */
 936   if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
 937     {
 938       const char *msg
 939         = ("The Nvidia accelerator has insufficient barrier resources to launch"
 940            " '%s' with num_workers = %d and vector_length = %d"
 941            "; "
 942            "recompile the program with 'num_workers = x' on that offloaded"
 943            " region or '-fopenacc-dim=:x:' where x <= 15"
 944            "; "
 945            "or, recompile the program with 'vector_length = 32' on that"
 946            " offloaded region or '-fopenacc-dim=::32'"
 947            ".\n");
 948         GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
 949                            dims[GOMP_DIM_VECTOR]);
 950     }
 951
 952   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 953                      " gangs=%u, workers=%u, vectors=%u\n",
 954                      __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
 955                      dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
 956
 957   // OpenACC            CUDA
 958   //
 959   // num_gangs          nctaid.x
 960   // num_workers        ntid.y
 961   // vector length      ntid.x
 962
 963   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
 964   acc_prof_info *prof_info = thr->prof_info;
 965   acc_event_info enqueue_launch_event_info;
 966   acc_api_info *api_info = thr->api_info;
 967   bool profiling_p = __builtin_expect (prof_info != NULL, false);
 968   if (profiling_p)
 969     {
 970       prof_info->event_type = acc_ev_enqueue_launch_start;
 971
 972       enqueue_launch_event_info.launch_event.event_type
 973         = prof_info->event_type;
 974       enqueue_launch_event_info.launch_event.valid_bytes
 975         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
 976       enqueue_launch_event_info.launch_event.parent_construct
 977         = acc_construct_parallel;
 978       enqueue_launch_event_info.launch_event.implicit = 1;
 979       enqueue_launch_event_info.launch_event.tool_info = NULL;
 980       enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
 981       enqueue_launch_event_info.launch_event.num_gangs
 982         = dims[GOMP_DIM_GANG];
 983       enqueue_launch_event_info.launch_event.num_workers
 984         = dims[GOMP_DIM_WORKER];
 985       enqueue_launch_event_info.launch_event.vector_length
 986         = dims[GOMP_DIM_VECTOR];
 987
 988       api_info->device_api = acc_device_api_cuda;
 989
 990       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
 991                                             api_info);
 992     }
 993
 994   kargs[0] = &dp;
 995   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 996                     dims[GOMP_DIM_GANG], 1, 1,
 997                     dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 998                     0, stream, kargs, 0);
 999
1000   if (profiling_p)
1001     {
1002       prof_info->event_type = acc_ev_enqueue_launch_end;
1003       enqueue_launch_event_info.launch_event.event_type
1004         = prof_info->event_type;
1005       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1006                                             api_info);
1007     }
1008
1009   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1010                      targ_fn->launch->fn);
1011 }
1012
1013 void * openacc_get_current_cuda_context (void);
1014
1015 static void
1016 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1017 {
1018   acc_prof_info *prof_info = thr->prof_info;
1019   acc_event_info data_event_info;
1020   acc_api_info *api_info = thr->api_info;
1021
1022   prof_info->event_type = acc_ev_alloc;
1023
1024   data_event_info.data_event.event_type = prof_info->event_type;
1025   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1026   data_event_info.data_event.parent_construct = acc_construct_parallel;
1027   data_event_info.data_event.implicit = 1;
1028   data_event_info.data_event.tool_info = NULL;
1029   data_event_info.data_event.var_name = NULL;
1030   data_event_info.data_event.bytes = s;
1031   data_event_info.data_event.host_ptr = NULL;
1032   data_event_info.data_event.device_ptr = dp;
1033
1034   api_info->device_api = acc_device_api_cuda;
1035
1036   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1037 }
1038
1039 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1040    size threshold, or if FORCE is true.  */
1041
1042 static void
1043 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1044 {
1045   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1046   if (ptx_dev->omp_stacks.ptr
1047       && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1048     {
1049       CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1050       if (r != CUDA_SUCCESS)
1051         GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1052       ptx_dev->omp_stacks.ptr = 0;
1053       ptx_dev->omp_stacks.size = 0;
1054     }
1055   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1056 }
1057
1058 static void *
1059 nvptx_alloc (size_t s, bool suppress_errors)
1060 {
1061   CUdeviceptr d;
1062
1063   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1064   if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1065     return NULL;
1066   else if (r != CUDA_SUCCESS)
1067     {
1068       GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1069       return NULL;
1070     }
1071
1072   /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
1073   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1074   bool profiling_p
1075     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1076   if (profiling_p)
1077     goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1078
1079   return (void *) d;
1080 }
1081
1082 static void
1083 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1084 {
1085   acc_prof_info *prof_info = thr->prof_info;
1086   acc_event_info data_event_info;
1087   acc_api_info *api_info = thr->api_info;
1088
1089   prof_info->event_type = acc_ev_free;
1090
1091   data_event_info.data_event.event_type = prof_info->event_type;
1092   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1093   data_event_info.data_event.parent_construct = acc_construct_parallel;
1094   data_event_info.data_event.implicit = 1;
1095   data_event_info.data_event.tool_info = NULL;
1096   data_event_info.data_event.var_name = NULL;
1097   data_event_info.data_event.bytes = -1;
1098   data_event_info.data_event.host_ptr = NULL;
1099   data_event_info.data_event.device_ptr = p;
1100
1101   api_info->device_api = acc_device_api_cuda;
1102
1103   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1104 }
1105
1106 static bool
1107 nvptx_free (void *p, struct ptx_device *ptx_dev)
1108 {
1109   CUdeviceptr pb;
1110   size_t ps;
1111
1112   CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1113                                   (CUdeviceptr) p);
1114   if (r == CUDA_ERROR_NOT_PERMITTED)
1115     {
1116       /* We assume that this error indicates we are in a CUDA callback context,
1117          where all CUDA calls are not allowed (see cuStreamAddCallback
1118          documentation for description). Arrange to free this piece of device
1119          memory later.  */
1120       struct ptx_free_block *n
1121         = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1122       n->ptr = p;
1123       pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1124       n->next = ptx_dev->free_blocks;
1125       ptx_dev->free_blocks = n;
1126       pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1127       return true;
1128     }
1129   else if (r != CUDA_SUCCESS)
1130     {
1131       GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1132       return false;
1133     }
1134   if ((CUdeviceptr) p != pb)
1135     {
1136       GOMP_PLUGIN_error ("invalid device address");
1137       return false;
1138     }
1139
1140   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1141   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1142   bool profiling_p
1143     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1144   if (profiling_p)
1145     goacc_profiling_acc_ev_free (thr, p);
1146
1147   return true;
1148 }
1149
1150 static void *
1151 nvptx_get_current_cuda_device (void)
1152 {
1153   struct nvptx_thread *nvthd = nvptx_thread ();
1154
1155   if (!nvthd || !nvthd->ptx_dev)
1156     return NULL;
1157
1158   return &nvthd->ptx_dev->dev;
1159 }
1160
1161 static void *
1162 nvptx_get_current_cuda_context (void)
1163 {
1164   struct nvptx_thread *nvthd = nvptx_thread ();
1165
1166   if (!nvthd || !nvthd->ptx_dev)
1167     return NULL;
1168
1169   return nvthd->ptx_dev->ctx;
1170 }
1171
1172 /* Plugin entry points.  */
1173
1174 const char *
1175 GOMP_OFFLOAD_get_name (void)
1176 {
1177   return "nvptx";
1178 }
1179
1180 unsigned int
1181 GOMP_OFFLOAD_get_caps (void)
1182 {
1183   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1184 }
1185
1186 int
1187 GOMP_OFFLOAD_get_type (void)
1188 {
1189   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1190 }
1191
1192 int
1193 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1194 {
1195   int num_devices = nvptx_get_num_devices ();
1196   /* Return -1 if no omp_requires_mask cannot be fulfilled but
1197      devices were present.  Unified-shared address: see comment in
1198      nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
1199   if (num_devices > 0
1200       && ((omp_requires_mask
1201            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1202                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1203     return -1;
1204   return num_devices;
1205 }
1206
1207 bool
1208 GOMP_OFFLOAD_init_device (int n)
1209 {
1210   struct ptx_device *dev;
1211
1212   pthread_mutex_lock (&ptx_dev_lock);
1213
1214   if (!nvptx_init () || ptx_devices[n] != NULL)
1215     {
1216       pthread_mutex_unlock (&ptx_dev_lock);
1217       return false;
1218     }
1219
1220   dev = nvptx_open_device (n);
1221   if (dev)
1222     {
1223       ptx_devices[n] = dev;
1224       instantiated_devices++;
1225     }
1226
1227   const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1228   const char *env_var = secure_getenv (var_name);
1229   notify_var (var_name, env_var);
1230
1231   if (env_var != NULL)
1232     {
1233       char *endptr;
1234       unsigned long val = strtoul (env_var, &endptr, 10);
1235       if (endptr == NULL || *endptr != '\0'
1236           || errno == ERANGE || errno == EINVAL
1237           || val > UINT_MAX)
1238         GOMP_PLUGIN_error ("Error parsing %s", var_name);
1239       else
1240         lowlat_pool_size = val;
1241     }
1242
1243   pthread_mutex_unlock (&ptx_dev_lock);
1244
1245   return dev != NULL;
1246 }
1247
1248 bool
1249 GOMP_OFFLOAD_fini_device (int n)
1250 {
1251   pthread_mutex_lock (&ptx_dev_lock);
1252
1253   if (ptx_devices[n] != NULL)
1254     {
1255       if (!nvptx_attach_host_thread_to_device (n)
1256           || !nvptx_close_device (ptx_devices[n]))
1257         {
1258           pthread_mutex_unlock (&ptx_dev_lock);
1259           return false;
1260         }
1261       ptx_devices[n] = NULL;
1262       instantiated_devices--;
1263     }
1264
1265   if (instantiated_devices == 0)
1266     {
1267       free (ptx_devices);
1268       ptx_devices = NULL;
1269     }
1270
1271   pthread_mutex_unlock (&ptx_dev_lock);
1272   return true;
1273 }
1274
1275 /* Return the libgomp version number we're compatible with.  There is
1276    no requirement for cross-version compatibility.  */
1277
1278 unsigned
1279 GOMP_OFFLOAD_version (void)
1280 {
1281   return GOMP_VERSION;
1282 }
1283
1284 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1285
1286 static void
1287 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1288 {
1289   CUdeviceptr dptr;
1290   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1291                                   module, "__nvptx_clocktick");
1292   if (r == CUDA_ERROR_NOT_FOUND)
1293     return;
1294   if (r != CUDA_SUCCESS)
1295     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1296   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1297   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1298                          sizeof (__nvptx_clocktick));
1299   if (r != CUDA_SUCCESS)
1300     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1301 }
1302
1303 /* Load the (partial) program described by TARGET_DATA to device
1304    number ORD.  Allocate and return TARGET_TABLE.  If not NULL, REV_FN_TABLE
1305    will contain the on-device addresses of the functions for reverse offload.
1306    To be freed by the caller.  */
1307
1308 int
1309 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1310                          struct addr_pair **target_table,
1311                          uint64_t **rev_fn_table,
1312                          uint64_t *host_ind_fn_table)
1313 {
1314   CUmodule module;
1315   const char *const *var_names;
1316   const struct targ_fn_launch *fn_descs;
1317   unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
1318   struct targ_fn_descriptor *targ_fns;
1319   struct addr_pair *targ_tbl;
1320   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1321   struct ptx_image_data *new_image;
1322   struct ptx_device *dev;
1323
1324   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1325     {
1326       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1327                          " (expected %u, received %u)",
1328                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1329       return -1;
1330     }
1331
1332   if (!nvptx_attach_host_thread_to_device (ord)
1333       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1334     return -1;
1335
1336   dev = ptx_devices[ord];
1337
1338   /* The mkoffload utility emits a struct of pointers/integers at the
1339      start of each offload image.  The array of kernel names and the
1340      functions addresses form a one-to-one correspondence.  */
1341
1342   var_entries = img_header->var_num;
1343   var_names = img_header->var_names;
1344   fn_entries = img_header->fn_num;
1345   fn_descs = img_header->fn_descs;
1346   ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1347                      ? img_header->ind_fn_num : 0;
1348
1349   /* Currently, other_entries contains only the struct of ICVs.  */
1350   other_entries = 1;
1351
1352   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1353                                  * (fn_entries + var_entries + other_entries));
1354   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1355                                  * fn_entries);
1356
1357   *target_table = targ_tbl;
1358
1359   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1360   new_image->target_data = target_data;
1361   new_image->module = module;
1362   new_image->fns = targ_fns;
1363
1364   pthread_mutex_lock (&dev->image_lock);
1365   new_image->next = dev->images;
1366   dev->images = new_image;
1367   pthread_mutex_unlock (&dev->image_lock);
1368
1369   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1370     {
1371       CUfunction function;
1372       int nregs, mthrs;
1373
1374       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1375                       fn_descs[i].fn);
1376       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1377                       CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1378       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1379                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1380
1381       targ_fns->fn = function;
1382       targ_fns->launch = &fn_descs[i];
1383       targ_fns->regs_per_thread = nregs;
1384       targ_fns->max_threads_per_block = mthrs;
1385
1386       targ_tbl->start = (uintptr_t) targ_fns;
1387       targ_tbl->end = targ_tbl->start + 1;
1388     }
1389
1390   for (j = 0; j < var_entries; j++, targ_tbl++)
1391     {
1392       CUdeviceptr var;
1393       size_t bytes;
1394
1395       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1396                       &var, &bytes, module, var_names[j]);
1397
1398       targ_tbl->start = (uintptr_t) var;
1399       targ_tbl->end = targ_tbl->start + bytes;
1400     }
1401
1402   if (ind_fn_entries > 0)
1403     {
1404       CUdeviceptr var;
1405       size_t bytes;
1406
1407       /* Read indirect function table from image.  */
1408       CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1409                                       "$offload_ind_func_table");
1410       if (r != CUDA_SUCCESS)
1411         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1412       assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1413
1414       uint64_t ind_fn_table[ind_fn_entries];
1415       r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1416       if (r != CUDA_SUCCESS)
1417         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1418
1419       /* Build host->target address map for indirect functions.  */
1420       uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1421       for (unsigned k = 0; k < ind_fn_entries; k++)
1422         {
1423           ind_fn_map[k * 2] = host_ind_fn_table[k];
1424           ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1425           GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1426                              k, host_ind_fn_table[k], ind_fn_table[k]);
1427         }
1428       ind_fn_map[ind_fn_entries * 2] = 0;
1429
1430       /* Write the map onto the target.  */
1431       void *map_target_addr
1432         = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1433       GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1434
1435       GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1436                              (void*) ind_fn_map,
1437                              sizeof (ind_fn_map));
1438
1439       /* Write address of the map onto the target.  */
1440       CUdeviceptr varptr;
1441       size_t varsize;
1442       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1443                              module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1444       if (r != CUDA_SUCCESS)
1445         GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1446                            cuda_error (r));
1447
1448       GOMP_PLUGIN_debug (0,
1449                          "Indirect map variable found at %llx with size %ld\n",
1450                          varptr, varsize);
1451
1452       GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1453                              sizeof (map_target_addr));
1454     }
1455
1456   CUdeviceptr varptr;
1457   size_t varsize;
1458   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1459                                   module, XSTRING (GOMP_ADDITIONAL_ICVS));
1460
1461   if (r == CUDA_SUCCESS)
1462     {
1463       targ_tbl->start = (uintptr_t) varptr;
1464       targ_tbl->end = (uintptr_t) (varptr + varsize);
1465     }
1466   else
1467     /* The variable was not in this image.  */
1468     targ_tbl->start = targ_tbl->end = 0;
1469
1470   if (rev_fn_table && fn_entries == 0)
1471     *rev_fn_table = NULL;
1472   else if (rev_fn_table)
1473     {
1474       CUdeviceptr var;
1475       size_t bytes;
1476       unsigned int i;
1477       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1478                              "$offload_func_table");
1479       if (r != CUDA_SUCCESS)
1480         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1481       assert (bytes == sizeof (uint64_t) * fn_entries);
1482       *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1483       r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1484       if (r != CUDA_SUCCESS)
1485         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1486       /* Free if only NULL entries.  */
1487       for (i = 0; i < fn_entries; ++i)
1488         if ((*rev_fn_table)[i] != 0)
1489           break;
1490       if (i == fn_entries)
1491         {
1492           free (*rev_fn_table);
1493           *rev_fn_table = NULL;
1494         }
1495     }
1496
1497   if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1498     {
1499       /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
1500          available but it might be not.  One reason could be: if the user code
1501          has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1502          is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1503          are not linked in.  */
1504       CUdeviceptr device_rev_offload_var;
1505       size_t device_rev_offload_size;
1506       CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1507                                       &device_rev_offload_var,
1508                                       &device_rev_offload_size, module,
1509                                       XSTRING (GOMP_REV_OFFLOAD_VAR));
1510       if (r != CUDA_SUCCESS)
1511         {
1512           free (*rev_fn_table);
1513           *rev_fn_table = NULL;
1514         }
1515       else
1516         {
1517           /* cuMemHostAlloc memory is accessible on the device, if
1518              unified-shared address is supported; this is assumed - see comment
1519              in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1520           CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1521                             sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1522           CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1523           r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1524                                  sizeof (dp));
1525           if (r != CUDA_SUCCESS)
1526             GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1527         }
1528     }
1529
1530   nvptx_set_clocktick (module, dev);
1531
1532   return fn_entries + var_entries + other_entries;
1533 }
1534
1535 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1536    function descriptors allocated by G_O_load_image.  */
1537
1538 bool
1539 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1540 {
1541   struct ptx_image_data *image, **prev_p;
1542   struct ptx_device *dev = ptx_devices[ord];
1543
1544   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1545     {
1546       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1547                          " (expected %u, received %u)",
1548                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1549       return false;
1550     }
1551
1552   bool ret = true;
1553   pthread_mutex_lock (&dev->image_lock);
1554   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1555     if (image->target_data == target_data)
1556       {
1557         *prev_p = image->next;
1558         if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1559           ret = false;
1560         free (image->fns);
1561         free (image);
1562         break;
1563       }
1564   pthread_mutex_unlock (&dev->image_lock);
1565   return ret;
1566 }
1567
1568 void *
1569 GOMP_OFFLOAD_alloc (int ord, size_t size)
1570 {
1571   if (!nvptx_attach_host_thread_to_device (ord))
1572     return NULL;
1573
1574   struct ptx_device *ptx_dev = ptx_devices[ord];
1575   struct ptx_free_block *blocks, *tmp;
1576
1577   pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1578   blocks = ptx_dev->free_blocks;
1579   ptx_dev->free_blocks = NULL;
1580   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1581
1582   nvptx_stacks_free (ptx_dev, false);
1583
1584   while (blocks)
1585     {
1586       tmp = blocks->next;
1587       nvptx_free (blocks->ptr, ptx_dev);
1588       free (blocks);
1589       blocks = tmp;
1590     }
1591
1592   void *d = nvptx_alloc (size, true);
1593   if (d)
1594     return d;
1595   else
1596     {
1597       /* Memory allocation failed.  Try freeing the stacks block, and
1598          retrying.  */
1599       nvptx_stacks_free (ptx_dev, true);
1600       return nvptx_alloc (size, false);
1601     }
1602 }
1603
1604 bool
1605 GOMP_OFFLOAD_free (int ord, void *ptr)
1606 {
1607   return (nvptx_attach_host_thread_to_device (ord)
1608           && nvptx_free (ptr, ptx_devices[ord]));
1609 }
1610
1611 void
1612 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1613                            size_t mapnum  __attribute__((unused)),
1614                            void **hostaddrs __attribute__((unused)),
1615                            void **devaddrs,
1616                            unsigned *dims, void *targ_mem_desc)
1617 {
1618   GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1619
1620   CUdeviceptr dp = (CUdeviceptr) devaddrs;
1621   nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
1622
1623   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1624   const char *maybe_abort_msg = "(perhaps abort was called)";
1625   if (r == CUDA_ERROR_LAUNCH_FAILED)
1626     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1627                        maybe_abort_msg);
1628   else if (r != CUDA_SUCCESS)
1629     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1630 }
1631
1632 void
1633 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1634                                  size_t mapnum __attribute__((unused)),
1635                                  void **hostaddrs __attribute__((unused)),
1636                                  void **devaddrs,
1637                                  unsigned *dims, void *targ_mem_desc,
1638                                  struct goacc_asyncqueue *aq)
1639 {
1640   GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1641
1642   CUdeviceptr dp = (CUdeviceptr) devaddrs;
1643   nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
1644 }
1645
1646 void *
1647 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1648 {
1649   struct ptx_device *ptx_dev;
1650   struct nvptx_thread *nvthd
1651     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1652   CUcontext thd_ctx;
1653
1654   ptx_dev = ptx_devices[ord];
1655
1656   assert (ptx_dev);
1657
1658   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1659
1660   assert (ptx_dev->ctx);
1661
1662   if (!thd_ctx)
1663     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1664
1665   nvthd->ptx_dev = ptx_dev;
1666
1667   return (void *) nvthd;
1668 }
1669
1670 void
1671 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1672 {
1673   free (data);
1674 }
1675
1676 void *
1677 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1678 {
1679   return nvptx_get_current_cuda_device ();
1680 }
1681
1682 void *
1683 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1684 {
1685   return nvptx_get_current_cuda_context ();
1686 }
1687
1688 /* This returns a CUstream.  */
1689 void *
1690 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1691 {
1692   return (void *) aq->cuda_stream;
1693 }
1694
1695 /* This takes a CUstream.  */
1696 int
1697 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1698 {
1699   if (aq->cuda_stream)
1700     {
1701       CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1702       CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1703     }
1704
1705   aq->cuda_stream = (CUstream) stream;
1706   return 1;
1707 }
1708
1709 static struct goacc_asyncqueue *
1710 nvptx_goacc_asyncqueue_construct (unsigned int flags)
1711 {
1712   CUstream stream = NULL;
1713   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1714
1715   struct goacc_asyncqueue *aq
1716     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1717   aq->cuda_stream = stream;
1718   return aq;
1719 }
1720
1721 struct goacc_asyncqueue *
1722 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1723 {
1724   return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1725 }
1726
1727 static bool
1728 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1729 {
1730   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1731   free (aq);
1732   return true;
1733 }
1734
1735 bool
1736 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1737 {
1738   return nvptx_goacc_asyncqueue_destruct (aq);
1739 }
1740
1741 int
1742 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1743 {
1744   CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1745   if (r == CUDA_SUCCESS)
1746     return 1;
1747   if (r == CUDA_ERROR_NOT_READY)
1748     return 0;
1749
1750   GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1751   return -1;
1752 }
1753
1754 static bool
1755 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1756 {
1757   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1758   return true;
1759 }
1760
1761 bool
1762 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1763 {
1764   return nvptx_goacc_asyncqueue_synchronize (aq);
1765 }
1766
1767 bool
1768 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1769                                       struct goacc_asyncqueue *aq2)
1770 {
1771   CUevent e;
1772   CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1773   CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1774   CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1775   return true;
1776 }
1777
1778 static void
1779 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1780 {
1781   if (res != CUDA_SUCCESS)
1782     GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1783   struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1784   cb->fn (cb->ptr);
1785   free (ptr);
1786 }
1787
1788 void
1789 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1790                                            void (*callback_fn)(void *),
1791                                            void *userptr)
1792 {
1793   struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1794   b->fn = callback_fn;
1795   b->ptr = userptr;
1796   b->aq = aq;
1797   CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1798                     cuda_callback_wrapper, (void *) b, 0);
1799 }
1800
1801 static bool
1802 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1803 {
1804   CUdeviceptr pb;
1805   size_t ps;
1806   if (!s)
1807     return true;
1808   if (!d)
1809     {
1810       GOMP_PLUGIN_error ("invalid device address");
1811       return false;
1812     }
1813   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1814   if (!pb)
1815     {
1816       GOMP_PLUGIN_error ("invalid device address");
1817       return false;
1818     }
1819   if (!h)
1820     {
1821       GOMP_PLUGIN_error ("invalid host address");
1822       return false;
1823     }
1824   if (d == h)
1825     {
1826       GOMP_PLUGIN_error ("invalid host or device address");
1827       return false;
1828     }
1829   if ((void *)(d + s) > (void *)(pb + ps))
1830     {
1831       GOMP_PLUGIN_error ("invalid size");
1832       return false;
1833     }
1834   return true;
1835 }
1836
1837 bool
1838 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1839 {
1840   if (!nvptx_attach_host_thread_to_device (ord)
1841       || !cuda_memcpy_sanity_check (src, dst, n))
1842     return false;
1843   CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1844   return true;
1845 }
1846
1847 bool
1848 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1849 {
1850   if (!nvptx_attach_host_thread_to_device (ord)
1851       || !cuda_memcpy_sanity_check (dst, src, n))
1852     return false;
1853   CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1854   return true;
1855 }
1856
1857 bool
1858 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1859 {
1860   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1861   return true;
1862 }
1863
1864 int
1865 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
1866                        size_t dim0_len, void *dst, size_t dst_offset1_size,
1867                        size_t dst_offset0_len, size_t dst_dim1_size,
1868                        const void *src, size_t src_offset1_size,
1869                        size_t src_offset0_len, size_t src_dim1_size)
1870 {
1871   if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1872     return false;
1873
1874   /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
1875
1876   CUDA_MEMCPY2D data;
1877
1878   memset (&data, 0, sizeof (data));
1879   data.WidthInBytes = dim1_size;
1880   data.Height = dim0_len;
1881
1882   if (dst_ord == -1)
1883     {
1884       data.dstMemoryType = CU_MEMORYTYPE_HOST;
1885       data.dstHost = dst;
1886     }
1887   else
1888     {
1889       data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1890       data.dstDevice = (CUdeviceptr) dst;
1891     }
1892   data.dstPitch = dst_dim1_size;
1893   data.dstXInBytes = dst_offset1_size;
1894   data.dstY = dst_offset0_len;
1895
1896   if (src_ord == -1)
1897     {
1898       data.srcMemoryType = CU_MEMORYTYPE_HOST;
1899       data.srcHost = src;
1900     }
1901   else
1902     {
1903       data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1904       data.srcDevice = (CUdeviceptr) src;
1905     }
1906   data.srcPitch = src_dim1_size;
1907   data.srcXInBytes = src_offset1_size;
1908   data.srcY = src_offset0_len;
1909
1910   if (data.srcXInBytes != 0 || data.srcY != 0)
1911     {
1912       /* Adjust origin to the actual array data, else the CUDA 2D memory
1913          copy API calls below may fail to validate source/dest pointers
1914          correctly (especially for Fortran where the "virtual origin" of an
1915          array is often outside the stored data).  */
1916       if (src_ord == -1)
1917         data.srcHost = (const void *) ((const char *) data.srcHost
1918                                       + data.srcY * data.srcPitch
1919                                       + data.srcXInBytes);
1920       else
1921         data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
1922       data.srcXInBytes = 0;
1923       data.srcY = 0;
1924     }
1925
1926   if (data.dstXInBytes != 0 || data.dstY != 0)
1927     {
1928       /* As above.  */
1929       if (dst_ord == -1)
1930         data.dstHost = (void *) ((char *) data.dstHost
1931                                  + data.dstY * data.dstPitch
1932                                  + data.dstXInBytes);
1933       else
1934         data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
1935       data.dstXInBytes = 0;
1936       data.dstY = 0;
1937     }
1938
1939   CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
1940   if (res == CUDA_ERROR_INVALID_VALUE)
1941     /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
1942        for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
1943        with an error; try the slower cuMemcpy2DUnaligned now.  */
1944     CUDA_CALL (cuMemcpy2DUnaligned, &data);
1945   else if (res != CUDA_SUCCESS)
1946     {
1947       GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
1948       return false;
1949     }
1950   return true;
1951 }
1952
1953 int
1954 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
1955                        size_t dim1_len, size_t dim0_len, void *dst,
1956                        size_t dst_offset2_size, size_t dst_offset1_len,
1957                        size_t dst_offset0_len, size_t dst_dim2_size,
1958                        size_t dst_dim1_len, const void *src,
1959                        size_t src_offset2_size, size_t src_offset1_len,
1960                        size_t src_offset0_len, size_t src_dim2_size,
1961                        size_t src_dim1_len)
1962 {
1963   if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1964     return false;
1965
1966   /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
1967
1968   CUDA_MEMCPY3D data;
1969
1970   memset (&data, 0, sizeof (data));
1971   data.WidthInBytes = dim2_size;
1972   data.Height = dim1_len;
1973   data.Depth = dim0_len;
1974
1975   if (dst_ord == -1)
1976     {
1977       data.dstMemoryType = CU_MEMORYTYPE_HOST;
1978       data.dstHost = dst;
1979     }
1980   else
1981     {
1982       data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1983       data.dstDevice = (CUdeviceptr) dst;
1984     }
1985   data.dstPitch = dst_dim2_size;
1986   data.dstHeight = dst_dim1_len;
1987   data.dstXInBytes = dst_offset2_size;
1988   data.dstY = dst_offset1_len;
1989   data.dstZ = dst_offset0_len;
1990
1991   if (src_ord == -1)
1992     {
1993       data.srcMemoryType = CU_MEMORYTYPE_HOST;
1994       data.srcHost = src;
1995     }
1996   else
1997     {
1998       data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1999       data.srcDevice = (CUdeviceptr) src;
2000     }
2001   data.srcPitch = src_dim2_size;
2002   data.srcHeight = src_dim1_len;
2003   data.srcXInBytes = src_offset2_size;
2004   data.srcY = src_offset1_len;
2005   data.srcZ = src_offset0_len;
2006
2007   if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
2008     {
2009       /* Adjust origin to the actual array data, else the CUDA 3D memory
2010          copy API call below may fail to validate source/dest pointers
2011          correctly (especially for Fortran where the "virtual origin" of an
2012          array is often outside the stored data).  */
2013       if (src_ord == -1)
2014         data.srcHost
2015           = (const void *) ((const char *) data.srcHost
2016                             + (data.srcZ * data.srcHeight + data.srcY)
2017                               * data.srcPitch
2018                             + data.srcXInBytes);
2019       else
2020         data.srcDevice
2021           += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
2022              + data.srcXInBytes;
2023       data.srcXInBytes = 0;
2024       data.srcY = 0;
2025       data.srcZ = 0;
2026     }
2027
2028   if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
2029     {
2030       /* As above.  */
2031       if (dst_ord == -1)
2032         data.dstHost = (void *) ((char *) data.dstHost
2033                                  + (data.dstZ * data.dstHeight + data.dstY)
2034                                    * data.dstPitch
2035                                  + data.dstXInBytes);
2036       else
2037         data.dstDevice
2038           += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
2039              + data.dstXInBytes;
2040       data.dstXInBytes = 0;
2041       data.dstY = 0;
2042       data.dstZ = 0;
2043     }
2044
2045   CUDA_CALL (cuMemcpy3D, &data);
2046   return true;
2047 }
2048
2049 bool
2050 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
2051                                      size_t n, struct goacc_asyncqueue *aq)
2052 {
2053   if (!nvptx_attach_host_thread_to_device (ord)
2054       || !cuda_memcpy_sanity_check (src, dst, n))
2055     return false;
2056   CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
2057   return true;
2058 }
2059
2060 bool
2061 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
2062                                      size_t n, struct goacc_asyncqueue *aq)
2063 {
2064   if (!nvptx_attach_host_thread_to_device (ord)
2065       || !cuda_memcpy_sanity_check (dst, src, n))
2066     return false;
2067   CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
2068   return true;
2069 }
2070
2071 union goacc_property_value
2072 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
2073 {
2074   union goacc_property_value propval = { .val = 0 };
2075
2076   pthread_mutex_lock (&ptx_dev_lock);
2077
2078   if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
2079     {
2080       pthread_mutex_unlock (&ptx_dev_lock);
2081       return propval;
2082     }
2083
2084   struct ptx_device *ptx_dev = ptx_devices[n];
2085   switch (prop)
2086     {
2087     case GOACC_PROPERTY_MEMORY:
2088       {
2089         size_t total_mem;
2090
2091         CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2092         propval.val = total_mem;
2093       }
2094       break;
2095     case GOACC_PROPERTY_FREE_MEMORY:
2096       {
2097         size_t total_mem;
2098         size_t free_mem;
2099         CUdevice ctxdev;
2100
2101         CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2102         if (ptx_dev->dev == ctxdev)
2103           CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2104         else if (ptx_dev->ctx)
2105           {
2106             CUcontext old_ctx;
2107
2108             CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2109             CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2110             CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2111           }
2112         else
2113           {
2114             CUcontext new_ctx;
2115
2116             CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2117                             ptx_dev->dev);
2118             CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2119             CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2120           }
2121         propval.val = free_mem;
2122       }
2123       break;
2124     case GOACC_PROPERTY_NAME:
2125       propval.ptr = ptx_dev->name;
2126       break;
2127     case GOACC_PROPERTY_VENDOR:
2128       propval.ptr = "Nvidia";
2129       break;
2130     case GOACC_PROPERTY_DRIVER:
2131       propval.ptr = cuda_driver_version_s;
2132       break;
2133     default:
2134       break;
2135     }
2136
2137   pthread_mutex_unlock (&ptx_dev_lock);
2138   return propval;
2139 }
2140
2141 /* Adjust launch dimensions: pick good values for number of blocks and warps
2142    and ensure that number of warps does not exceed CUDA limits as well as GCC's
2143    own limits.  */
2144
2145 static void
2146 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2147                             struct ptx_device *ptx_dev,
2148                             int *teams_p, int *threads_p)
2149 {
2150   int max_warps_block = fn->max_threads_per_block / 32;
2151   /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2152      and libgcc, which matches documented limit of all GPUs as of 2015.  */
2153   if (max_warps_block > 32)
2154     max_warps_block = 32;
2155   if (*threads_p <= 0)
2156     *threads_p = 8;
2157   if (*threads_p > max_warps_block)
2158     *threads_p = max_warps_block;
2159
2160   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2161   /* This is an estimate of how many blocks the device can host simultaneously.
2162      Actual limit, which may be lower, can be queried with "occupancy control"
2163      driver interface (since CUDA 6.0).  */
2164   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2165   if (*teams_p <= 0 || *teams_p > max_blocks)
2166     *teams_p = max_blocks;
2167 }
2168
2169 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2170    target regions.  */
2171
2172 static size_t
2173 nvptx_stacks_size ()
2174 {
2175   return 128 * 1024;
2176 }
2177
2178 /* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
2179    the storage should be held on entry, and remains held on exit.  */
2180
2181 static void *
2182 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2183 {
2184   if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2185     return (void *) ptx_dev->omp_stacks.ptr;
2186
2187   /* Free the old, too-small stacks.  */
2188   if (ptx_dev->omp_stacks.ptr)
2189     {
2190       CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2191       if (r != CUDA_SUCCESS)
2192         GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2193       r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2194       if (r != CUDA_SUCCESS)
2195         GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2196     }
2197
2198   /* Make new and bigger stacks, and remember where we put them and how big
2199      they are.  */
2200   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2201                                   size * num);
2202   if (r != CUDA_SUCCESS)
2203     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2204
2205   ptx_dev->omp_stacks.size = size * num;
2206
2207   return (void *) ptx_dev->omp_stacks.ptr;
2208 }
2209
2210
2211 void
2212 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2213 {
2214   struct targ_fn_descriptor *tgt_fn_desc
2215     = (struct targ_fn_descriptor *) tgt_fn;
2216   CUfunction function = tgt_fn_desc->fn;
2217   const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2218   const char *fn_name = launch->fn;
2219   CUresult r;
2220   struct ptx_device *ptx_dev = ptx_devices[ord];
2221   const char *maybe_abort_msg = "(perhaps abort was called)";
2222   int teams = 0, threads = 0;
2223
2224   if (!args)
2225     GOMP_PLUGIN_fatal ("No target arguments provided");
2226   while (*args)
2227     {
2228       intptr_t id = (intptr_t) *args++, val;
2229       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2230         val = (intptr_t) *args++;
2231       else
2232         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2233       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2234         continue;
2235       val = val > INT_MAX ? INT_MAX : val;
2236       id &= GOMP_TARGET_ARG_ID_MASK;
2237       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2238         teams = val;
2239       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2240         threads = val;
2241     }
2242   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2243
2244   bool reverse_offload = ptx_dev->rev_data != NULL;
2245   struct goacc_asyncqueue *reverse_offload_aq = NULL;
2246   if (reverse_offload)
2247     {
2248       reverse_offload_aq
2249         = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2250       if (!reverse_offload_aq)
2251         exit (EXIT_FAILURE);
2252     }
2253
2254   size_t stack_size = nvptx_stacks_size ();
2255
2256   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2257   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2258   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2259   size_t fn_args_size = sizeof fn_args;
2260   void *config[] = {
2261     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2262     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2263     CU_LAUNCH_PARAM_END
2264   };
2265   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
2266                      " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2267                      __FUNCTION__, fn_name, teams, threads);
2268   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2269                          32, threads, 1, lowlat_pool_size, NULL, NULL, config);
2270   if (r != CUDA_SUCCESS)
2271     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2272   if (reverse_offload)
2273     while (true)
2274       {
2275         r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2276         if (r == CUDA_SUCCESS)
2277           break;
2278         if (r == CUDA_ERROR_LAUNCH_FAILED)
2279           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2280                              maybe_abort_msg);
2281         else if (r != CUDA_ERROR_NOT_READY)
2282           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2283
2284         if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2285           {
2286             struct rev_offload *rev_data = ptx_dev->rev_data;
2287             GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2288                                     rev_data->addrs, rev_data->sizes,
2289                                     rev_data->kinds, rev_data->dev_num,
2290                                     reverse_offload_aq);
2291             if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2292               exit (EXIT_FAILURE);
2293             __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2294           }
2295         usleep (1);
2296       }
2297   else
2298     r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2299   if (r == CUDA_ERROR_LAUNCH_FAILED)
2300     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2301                        maybe_abort_msg);
2302   else if (r != CUDA_SUCCESS)
2303     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2304
2305   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2306
2307   if (reverse_offload)
2308     {
2309       if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2310         exit (EXIT_FAILURE);
2311     }
2312 }
2313
2314 /* TODO: Implement GOMP_OFFLOAD_async_run. */