libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2023 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #define _GNU_SOURCE
  35 #include "openacc.h"
  36 #include "config.h"
  37 #include "symcat.h"
  38 #include "libgomp-plugin.h"
  39 #include "oacc-plugin.h"
  40 #include "gomp-constants.h"
  41 #include "oacc-int.h"
  42
  43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
  44 #include "config/nvptx/libgomp-nvptx.h"
  45
  46 #include <pthread.h>
  47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
  48 # include "cuda/cuda.h"
  49 #else
  50 # include <cuda.h>
  51 #endif
  52 #include <stdbool.h>
  53 #include <limits.h>
  54 #include <string.h>
  55 #include <stdio.h>
  56 #include <unistd.h>
  57 #include <assert.h>
  58 #include <errno.h>
  59 #include <stdlib.h>
  60
  61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
  62    block to cache between kernel invocations.  For soft-stacks blocks bigger
  63    than this, we will free the block before attempting another GPU memory
  64    allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
  65    we will free the cached soft-stacks block anyway then retry the
  66    allocation.  If that fails too, we lose.  */
  67
  68 #define SOFTSTACK_CACHE_LIMIT 134217728
  69
  70 #if CUDA_VERSION < 6000
  71 extern CUresult cuGetErrorString (CUresult, const char **);
  72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
  73 #endif
  74
  75 #if CUDA_VERSION >= 6050
  76 #undef cuLinkCreate
  77 #undef cuLinkAddData
  78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
  79                         const char *, unsigned, CUjit_option *, void **);
  80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
  81 #else
  82 typedef size_t (*CUoccupancyB2DSize)(int);
  83 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
  84                            const char *, unsigned, CUjit_option *, void **);
  85 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
  86 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
  87                                           CUoccupancyB2DSize, size_t, int);
  88 #endif
  89
  90 #define DO_PRAGMA(x) _Pragma (#x)
  91
  92 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
  93 # include <dlfcn.h>
  94
  95 struct cuda_lib_s {
  96
  97 # define CUDA_ONE_CALL(call)                    \
  98   __typeof (call) *call;
  99 # define CUDA_ONE_CALL_MAYBE_NULL(call)         \
 100   CUDA_ONE_CALL (call)
 101 #include "cuda-lib.def"
 102 # undef CUDA_ONE_CALL
 103 # undef CUDA_ONE_CALL_MAYBE_NULL
 104
 105 } cuda_lib;
 106
 107 /* -1 if init_cuda_lib has not been called yet, false
 108    if it has been and failed, true if it has been and succeeded.  */
 109 static signed char cuda_lib_inited = -1;
 110
 111 /* Dynamically load the CUDA runtime library and initialize function
 112    pointers, return false if unsuccessful, true if successful.  */
 113 static bool
 114 init_cuda_lib (void)
 115 {
 116   if (cuda_lib_inited != -1)
 117     return cuda_lib_inited;
 118   const char *cuda_runtime_lib = "libcuda.so.1";
 119   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
 120   cuda_lib_inited = false;
 121   if (h == NULL)
 122     return false;
 123
 124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
 125 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
 126 # define CUDA_ONE_CALL_1(call, allow_null)              \
 127   cuda_lib.call = dlsym (h, #call);     \
 128   if (!allow_null && cuda_lib.call == NULL)             \
 129     return false;
 130 #include "cuda-lib.def"
 131 # undef CUDA_ONE_CALL
 132 # undef CUDA_ONE_CALL_1
 133 # undef CUDA_ONE_CALL_MAYBE_NULL
 134
 135   cuda_lib_inited = true;
 136   return true;
 137 }
 138 # define CUDA_CALL_PREFIX cuda_lib.
 139 #else
 140
 141 # define CUDA_ONE_CALL(call)
 142 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
 143 #include "cuda-lib.def"
 144 #undef CUDA_ONE_CALL_MAYBE_NULL
 145 #undef CUDA_ONE_CALL
 146
 147 # define CUDA_CALL_PREFIX
 148 # define init_cuda_lib() true
 149 #endif
 150
 151 #include "secure_getenv.h"
 152
 153 #undef MIN
 154 #undef MAX
 155 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 156 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 157
 158 /* Convenience macros for the frequently used CUDA library call and
 159    error handling sequence as well as CUDA library calls that
 160    do the error checking themselves or don't do it at all.  */
 161
 162 #define CUDA_CALL_ERET(ERET, FN, ...)           \
 163   do {                                          \
 164     unsigned __r                                \
 165       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 166     if (__r != CUDA_SUCCESS)                    \
 167       {                                         \
 168         GOMP_PLUGIN_error (#FN " error: %s",    \
 169                            cuda_error (__r));   \
 170         return ERET;                            \
 171       }                                         \
 172   } while (0)
 173
 174 #define CUDA_CALL(FN, ...)                      \
 175   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 176
 177 #define CUDA_CALL_ASSERT(FN, ...)               \
 178   do {                                          \
 179     unsigned __r                                \
 180       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 181     if (__r != CUDA_SUCCESS)                    \
 182       {                                         \
 183         GOMP_PLUGIN_fatal (#FN " error: %s",    \
 184                            cuda_error (__r));   \
 185       }                                         \
 186   } while (0)
 187
 188 #define CUDA_CALL_NOCHECK(FN, ...)              \
 189   CUDA_CALL_PREFIX FN (__VA_ARGS__)
 190
 191 #define CUDA_CALL_EXISTS(FN)                    \
 192   CUDA_CALL_PREFIX FN
 193
 194 static const char *
 195 cuda_error (CUresult r)
 196 {
 197   const char *fallback = "unknown cuda error";
 198   const char *desc;
 199
 200   if (!CUDA_CALL_EXISTS (cuGetErrorString))
 201     return fallback;
 202
 203   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
 204   if (r == CUDA_SUCCESS)
 205     return desc;
 206
 207   return fallback;
 208 }
 209
 210 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
 211    Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
 212 static char cuda_driver_version_s[30];
 213
 214 static unsigned int instantiated_devices = 0;
 215 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 216
 217 /* NVPTX/CUDA specific definition of asynchronous queues.  */
 218 struct goacc_asyncqueue
 219 {
 220   CUstream cuda_stream;
 221 };
 222
 223 struct nvptx_callback
 224 {
 225   void (*fn) (void *);
 226   void *ptr;
 227   struct goacc_asyncqueue *aq;
 228   struct nvptx_callback *next;
 229 };
 230
 231 /* Thread-specific data for PTX.  */
 232
 233 struct nvptx_thread
 234 {
 235   /* We currently have this embedded inside the plugin because libgomp manages
 236      devices through integer target_ids.  This might be better if using an
 237      opaque target-specific pointer directly from gomp_device_descr.  */
 238   struct ptx_device *ptx_dev;
 239 };
 240
 241 /* Target data function launch information.  */
 242
 243 struct targ_fn_launch
 244 {
 245   const char *fn;
 246   unsigned short dim[GOMP_DIM_MAX];
 247 };
 248
 249 /* Target PTX object information.  */
 250
 251 struct targ_ptx_obj
 252 {
 253   const char *code;
 254   size_t size;
 255 };
 256
 257 /* Target data image information.  */
 258
 259 typedef struct nvptx_tdata
 260 {
 261   const struct targ_ptx_obj *ptx_objs;
 262   unsigned ptx_num;
 263
 264   const char *const *var_names;
 265   unsigned var_num;
 266
 267   const struct targ_fn_launch *fn_descs;
 268   unsigned fn_num;
 269
 270   unsigned ind_fn_num;
 271 } nvptx_tdata_t;
 272
 273 /* Descriptor of a loaded function.  */
 274
 275 struct targ_fn_descriptor
 276 {
 277   CUfunction fn;
 278   const struct targ_fn_launch *launch;
 279   int regs_per_thread;
 280   int max_threads_per_block;
 281 };
 282
 283 /* A loaded PTX image.  */
 284 struct ptx_image_data
 285 {
 286   const void *target_data;
 287   CUmodule module;
 288
 289   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 290
 291   struct ptx_image_data *next;
 292 };
 293
 294 struct ptx_free_block
 295 {
 296   void *ptr;
 297   struct ptx_free_block *next;
 298 };
 299
 300 struct ptx_device
 301 {
 302   CUcontext ctx;
 303   bool ctx_shared;
 304   CUdevice dev;
 305
 306   int ord;
 307   bool overlap;
 308   bool map;
 309   bool concur;
 310   bool mkern;
 311   int mode;
 312   int clock_khz;
 313   int num_sms;
 314   int regs_per_block;
 315   int regs_per_sm;
 316   int warp_size;
 317   int max_threads_per_block;
 318   int max_threads_per_multiprocessor;
 319   int default_dims[GOMP_DIM_MAX];
 320
 321   /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
 322   char name[256];
 323
 324   struct ptx_image_data *images;  /* Images loaded on device.  */
 325   pthread_mutex_t image_lock;     /* Lock for above list.  */
 326
 327   struct ptx_free_block *free_blocks;
 328   pthread_mutex_t free_blocks_lock;
 329
 330   /* OpenMP stacks, cached between kernel invocations.  */
 331   struct
 332     {
 333       CUdeviceptr ptr;
 334       size_t size;
 335       pthread_mutex_t lock;
 336     } omp_stacks;
 337
 338   struct rev_offload *rev_data;
 339   struct ptx_device *next;
 340 };
 341
 342 static struct ptx_device **ptx_devices;
 343
 344 static inline struct nvptx_thread *
 345 nvptx_thread (void)
 346 {
 347   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 348 }
 349
 350 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 351    should be locked on entry and remains locked on exit.  */
 352
 353 static bool
 354 nvptx_init (void)
 355 {
 356   int ndevs;
 357
 358   if (instantiated_devices != 0)
 359     return true;
 360
 361   if (!init_cuda_lib ())
 362     return false;
 363
 364   CUDA_CALL (cuInit, 0);
 365
 366   int cuda_driver_version;
 367   CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
 368   snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
 369             "CUDA Driver %u.%u",
 370             cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
 371
 372   CUDA_CALL (cuDeviceGetCount, &ndevs);
 373   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 374                                             * ndevs);
 375
 376   return true;
 377 }
 378
 379 /* Select the N'th PTX device for the current host thread.  The device must
 380    have been previously opened before calling this function.  */
 381
 382 static bool
 383 nvptx_attach_host_thread_to_device (int n)
 384 {
 385   CUdevice dev;
 386   CUresult r;
 387   struct ptx_device *ptx_dev;
 388   CUcontext thd_ctx;
 389
 390   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
 391   if (r == CUDA_ERROR_NOT_PERMITTED)
 392     {
 393       /* Assume we're in a CUDA callback, just return true.  */
 394       return true;
 395     }
 396   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 397     {
 398       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 399       return false;
 400     }
 401
 402   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 403     return true;
 404   else
 405     {
 406       CUcontext old_ctx;
 407
 408       ptx_dev = ptx_devices[n];
 409       if (!ptx_dev)
 410         {
 411           GOMP_PLUGIN_error ("device %d not found", n);
 412           return false;
 413         }
 414
 415       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 416
 417       /* We don't necessarily have a current context (e.g. if it has been
 418          destroyed.  Pop it if we do though.  */
 419       if (thd_ctx != NULL)
 420         CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 421
 422       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
 423     }
 424   return true;
 425 }
 426
 427 static struct ptx_device *
 428 nvptx_open_device (int n)
 429 {
 430   struct ptx_device *ptx_dev;
 431   CUdevice dev, ctx_dev;
 432   CUresult r;
 433   int pi;
 434
 435   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 436
 437   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 438
 439   ptx_dev->ord = n;
 440   ptx_dev->dev = dev;
 441   ptx_dev->ctx_shared = false;
 442
 443   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
 444   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 445     {
 446       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 447       return NULL;
 448     }
 449
 450   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 451     {
 452       /* The current host thread has an active context for a different device.
 453          Detach it.  */
 454       CUcontext old_ctx;
 455       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
 456     }
 457
 458   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 459
 460   if (!ptx_dev->ctx)
 461     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 462   else
 463     ptx_dev->ctx_shared = true;
 464
 465   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 466                   &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 467   ptx_dev->overlap = pi;
 468
 469   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 470                   &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 471   ptx_dev->map = pi;
 472
 473   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 474                   &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 475   ptx_dev->concur = pi;
 476
 477   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 478                   &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 479   ptx_dev->mode = pi;
 480
 481   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 482                   &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 483   ptx_dev->mkern = pi;
 484
 485   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 486                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 487   ptx_dev->clock_khz = pi;
 488
 489   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 490                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 491   ptx_dev->num_sms = pi;
 492
 493   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 494                   &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 495   ptx_dev->regs_per_block = pi;
 496
 497   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
 498      in CUDA 6.0 and newer.  */
 499   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
 500                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
 501                          dev);
 502   /* Fallback: use limit of registers per block, which is usually equal.  */
 503   if (r == CUDA_ERROR_INVALID_VALUE)
 504     pi = ptx_dev->regs_per_block;
 505   else if (r != CUDA_SUCCESS)
 506     {
 507       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
 508       return NULL;
 509     }
 510   ptx_dev->regs_per_sm = pi;
 511
 512   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 513                   &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 514   if (pi != 32)
 515     {
 516       GOMP_PLUGIN_error ("Only warp size 32 is supported");
 517       return NULL;
 518     }
 519   ptx_dev->warp_size = pi;
 520
 521   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 522                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 523   ptx_dev->max_threads_per_block = pi;
 524
 525   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 526                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
 527   ptx_dev->max_threads_per_multiprocessor = pi;
 528
 529   /* Required below for reverse offload as implemented, but with compute
 530      capability >= 2.0 and 64bit device processes, this should be universally be
 531      the case; hence, an assert.  */
 532   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
 533                          CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
 534   assert (r == CUDA_SUCCESS && pi);
 535
 536   for (int i = 0; i != GOMP_DIM_MAX; i++)
 537     ptx_dev->default_dims[i] = 0;
 538
 539   CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
 540                   dev);
 541
 542   ptx_dev->images = NULL;
 543   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 544
 545   ptx_dev->free_blocks = NULL;
 546   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 547
 548   ptx_dev->omp_stacks.ptr = 0;
 549   ptx_dev->omp_stacks.size = 0;
 550   pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
 551
 552   ptx_dev->rev_data = NULL;
 553
 554   return ptx_dev;
 555 }
 556
 557 static bool
 558 nvptx_close_device (struct ptx_device *ptx_dev)
 559 {
 560   if (!ptx_dev)
 561     return true;
 562
 563   for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
 564     {
 565       struct ptx_free_block *b_next = b->next;
 566       CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
 567       free (b);
 568       b = b_next;
 569     }
 570
 571   pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
 572   pthread_mutex_destroy (&ptx_dev->image_lock);
 573
 574   pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
 575
 576   if (ptx_dev->omp_stacks.ptr)
 577     CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
 578
 579   if (!ptx_dev->ctx_shared)
 580     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 581
 582   free (ptx_dev);
 583   return true;
 584 }
 585
 586 static int
 587 nvptx_get_num_devices (void)
 588 {
 589   int n;
 590
 591   /* This function will be called before the plugin has been initialized in
 592      order to enumerate available devices, but CUDA API routines can't be used
 593      until cuInit has been called.  Just call it now (but don't yet do any
 594      further initialization).  */
 595   if (instantiated_devices == 0)
 596     {
 597       if (!init_cuda_lib ())
 598         return 0;
 599       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
 600       /* This is not an error: e.g. we may have CUDA libraries installed but
 601          no devices available.  */
 602       if (r != CUDA_SUCCESS)
 603         {
 604           GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
 605                              cuda_error (r));
 606           return 0;
 607         }
 608     }
 609
 610   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
 611   return n;
 612 }
 613
 614 static void
 615 notify_var (const char *var_name, const char *env_var)
 616 {
 617   if (env_var == NULL)
 618     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
 619   else
 620     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 621 }
 622
 623 static void
 624 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
 625 {
 626   const char *var_name = "GOMP_NVPTX_JIT";
 627   const char *env_var = secure_getenv (var_name);
 628   notify_var (var_name, env_var);
 629
 630   if (env_var == NULL)
 631     return;
 632
 633   const char *c = env_var;
 634   while (*c != '\0')
 635     {
 636       while (*c == ' ')
 637         c++;
 638
 639       if (c[0] == '-' && c[1] == 'O'
 640           && '0' <= c[2] && c[2] <= '4'
 641           && (c[3] == '\0' || c[3] == ' '))
 642         {
 643           *gomp_nvptx_o = c[2] - '0';
 644           c += 3;
 645           continue;
 646         }
 647
 648       GOMP_PLUGIN_error ("Error parsing %s", var_name);
 649       break;
 650     }
 651 }
 652
 653 static bool
 654 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 655           unsigned num_objs)
 656 {
 657   CUjit_option opts[7];
 658   void *optvals[7];
 659   float elapsed = 0.0;
 660   char elog[1024];
 661   char ilog[16384];
 662   CUlinkState linkstate;
 663   CUresult r;
 664   void *linkout;
 665   size_t linkoutsize __attribute__ ((unused));
 666
 667   opts[0] = CU_JIT_WALL_TIME;
 668   optvals[0] = &elapsed;
 669
 670   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 671   optvals[1] = &ilog[0];
 672
 673   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 674   optvals[2] = (void *) sizeof ilog;
 675
 676   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 677   optvals[3] = &elog[0];
 678
 679   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 680   optvals[4] = (void *) sizeof elog;
 681
 682   opts[5] = CU_JIT_LOG_VERBOSE;
 683   optvals[5] = (void *) 1;
 684
 685   static intptr_t gomp_nvptx_o = -1;
 686
 687   static bool init_done = false;
 688   if (!init_done)
 689     {
 690       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
 691       init_done = true;
 692   }
 693
 694   int nopts = 6;
 695   if (gomp_nvptx_o != -1)
 696     {
 697       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
 698       optvals[nopts] = (void *) gomp_nvptx_o;
 699       nopts++;
 700     }
 701
 702   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
 703     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
 704   else
 705     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
 706
 707   for (; num_objs--; ptx_objs++)
 708     {
 709       /* cuLinkAddData's 'data' argument erroneously omits the const
 710          qualifier.  */
 711       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 712       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
 713         r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
 714                                (char *) ptx_objs->code, ptx_objs->size,
 715                                0, 0, 0, 0);
 716       else
 717         r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
 718                                (char *) ptx_objs->code, ptx_objs->size,
 719                                0, 0, 0, 0);
 720       if (r != CUDA_SUCCESS)
 721         {
 722           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 723           GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
 724                              cuda_error (r));
 725           return false;
 726         }
 727     }
 728
 729   GOMP_PLUGIN_debug (0, "Linking\n");
 730   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 731
 732   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 733   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 734
 735   if (r != CUDA_SUCCESS)
 736     {
 737       GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 738       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
 739       return false;
 740     }
 741
 742   CUDA_CALL (cuModuleLoadData, module, linkout);
 743   CUDA_CALL (cuLinkDestroy, linkstate);
 744   return true;
 745 }
 746
 747 static void
 748 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
 749             CUdeviceptr dp, CUstream stream)
 750 {
 751   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
 752   CUfunction function;
 753   int i;
 754   void *kargs[1];
 755   struct nvptx_thread *nvthd = nvptx_thread ();
 756   int warp_size = nvthd->ptx_dev->warp_size;
 757
 758   function = targ_fn->fn;
 759
 760   /* Initialize the launch dimensions.  Typically this is constant,
 761      provided by the device compiler, but we must permit runtime
 762      values.  */
 763   int seen_zero = 0;
 764   for (i = 0; i != GOMP_DIM_MAX; i++)
 765     {
 766       if (targ_fn->launch->dim[i])
 767        dims[i] = targ_fn->launch->dim[i];
 768       if (!dims[i])
 769        seen_zero = 1;
 770     }
 771
 772   if (seen_zero)
 773     {
 774       pthread_mutex_lock (&ptx_dev_lock);
 775
 776       static int gomp_openacc_dims[GOMP_DIM_MAX];
 777       if (!gomp_openacc_dims[0])
 778         {
 779           /* See if the user provided GOMP_OPENACC_DIM environment
 780              variable to specify runtime defaults.  */
 781           for (int i = 0; i < GOMP_DIM_MAX; ++i)
 782             gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
 783         }
 784
 785       if (!nvthd->ptx_dev->default_dims[0])
 786         {
 787           int default_dims[GOMP_DIM_MAX];
 788           for (int i = 0; i < GOMP_DIM_MAX; ++i)
 789             default_dims[i] = gomp_openacc_dims[i];
 790
 791           int gang, worker, vector;
 792           {
 793             int block_size = nvthd->ptx_dev->max_threads_per_block;
 794             int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
 795             int dev_size = nvthd->ptx_dev->num_sms;
 796             GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
 797                                " dev_size=%d, cpu_size=%d\n",
 798                                warp_size, block_size, dev_size, cpu_size);
 799
 800             gang = (cpu_size / block_size) * dev_size;
 801             worker = block_size / warp_size;
 802             vector = warp_size;
 803           }
 804
 805           /* There is no upper bound on the gang size.  The best size
 806              matches the hardware configuration.  Logical gangs are
 807              scheduled onto physical hardware.  To maximize usage, we
 808              should guess a large number.  */
 809           if (default_dims[GOMP_DIM_GANG] < 1)
 810             default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
 811           /* The worker size must not exceed the hardware.  */
 812           if (default_dims[GOMP_DIM_WORKER] < 1
 813               || (default_dims[GOMP_DIM_WORKER] > worker && gang))
 814             default_dims[GOMP_DIM_WORKER] = worker;
 815           /* The vector size must exactly match the hardware.  */
 816           if (default_dims[GOMP_DIM_VECTOR] < 1
 817               || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
 818             default_dims[GOMP_DIM_VECTOR] = vector;
 819
 820           GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
 821                              default_dims[GOMP_DIM_GANG],
 822                              default_dims[GOMP_DIM_WORKER],
 823                              default_dims[GOMP_DIM_VECTOR]);
 824
 825           for (i = 0; i != GOMP_DIM_MAX; i++)
 826             nvthd->ptx_dev->default_dims[i] = default_dims[i];
 827         }
 828       pthread_mutex_unlock (&ptx_dev_lock);
 829
 830       {
 831         bool default_dim_p[GOMP_DIM_MAX];
 832         for (i = 0; i != GOMP_DIM_MAX; i++)
 833           default_dim_p[i] = !dims[i];
 834
 835         if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
 836           {
 837             for (i = 0; i != GOMP_DIM_MAX; i++)
 838               if (default_dim_p[i])
 839                 dims[i] = nvthd->ptx_dev->default_dims[i];
 840
 841             if (default_dim_p[GOMP_DIM_VECTOR])
 842               dims[GOMP_DIM_VECTOR]
 843                 = MIN (dims[GOMP_DIM_VECTOR],
 844                        (targ_fn->max_threads_per_block / warp_size
 845                         * warp_size));
 846
 847             if (default_dim_p[GOMP_DIM_WORKER])
 848               dims[GOMP_DIM_WORKER]
 849                 = MIN (dims[GOMP_DIM_WORKER],
 850                        targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
 851           }
 852         else
 853           {
 854             /* Handle the case that the compiler allows the runtime to choose
 855                the vector-length conservatively, by ignoring
 856                gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
 857                it.  */
 858             int vectors = 0;
 859             /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
 860                gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
 861                exceed targ_fn->max_threads_per_block. */
 862             int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
 863             int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
 864             int grids, blocks;
 865
 866             CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
 867                               &blocks, function, NULL, 0,
 868                               dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
 869             GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
 870                                "grid = %d, block = %d\n", grids, blocks);
 871
 872             /* Keep the num_gangs proportional to the block size.  In
 873                the case were a block size is limited by shared-memory
 874                or the register file capacity, the runtime will not
 875                excessively over assign gangs to the multiprocessor
 876                units if their state is going to be swapped out even
 877                more than necessary. The constant factor 2 is there to
 878                prevent threads from idling when there is insufficient
 879                work for them.  */
 880             if (gangs == 0)
 881               gangs = 2 * grids * (blocks / warp_size);
 882
 883             if (vectors == 0)
 884               vectors = warp_size;
 885
 886             if (workers == 0)
 887               {
 888                 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
 889                                       ? vectors
 890                                       : dims[GOMP_DIM_VECTOR]);
 891                 workers = blocks / actual_vectors;
 892                 workers = MAX (workers, 1);
 893                 /* If we need a per-worker barrier ... .  */
 894                 if (actual_vectors > 32)
 895                   /* Don't use more barriers than available.  */
 896                   workers = MIN (workers, 15);
 897               }
 898
 899             for (i = 0; i != GOMP_DIM_MAX; i++)
 900               if (default_dim_p[i])
 901                 switch (i)
 902                   {
 903                   case GOMP_DIM_GANG: dims[i] = gangs; break;
 904                   case GOMP_DIM_WORKER: dims[i] = workers; break;
 905                   case GOMP_DIM_VECTOR: dims[i] = vectors; break;
 906                   default: GOMP_PLUGIN_fatal ("invalid dim");
 907                   }
 908           }
 909       }
 910     }
 911
 912   /* Check if the accelerator has sufficient hardware resources to
 913      launch the offloaded kernel.  */
 914   if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
 915       > targ_fn->max_threads_per_block)
 916     {
 917       const char *msg
 918         = ("The Nvidia accelerator has insufficient resources to launch '%s'"
 919            " with num_workers = %d and vector_length = %d"
 920            "; "
 921            "recompile the program with 'num_workers = x and vector_length = y'"
 922            " on that offloaded region or '-fopenacc-dim=:x:y' where"
 923            " x * y <= %d"
 924            ".\n");
 925       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
 926                          dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
 927     }
 928
 929   /* Check if the accelerator has sufficient barrier resources to
 930      launch the offloaded kernel.  */
 931   if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
 932     {
 933       const char *msg
 934         = ("The Nvidia accelerator has insufficient barrier resources to launch"
 935            " '%s' with num_workers = %d and vector_length = %d"
 936            "; "
 937            "recompile the program with 'num_workers = x' on that offloaded"
 938            " region or '-fopenacc-dim=:x:' where x <= 15"
 939            "; "
 940            "or, recompile the program with 'vector_length = 32' on that"
 941            " offloaded region or '-fopenacc-dim=::32'"
 942            ".\n");
 943         GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
 944                            dims[GOMP_DIM_VECTOR]);
 945     }
 946
 947   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 948                      " gangs=%u, workers=%u, vectors=%u\n",
 949                      __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
 950                      dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
 951
 952   // OpenACC            CUDA
 953   //
 954   // num_gangs          nctaid.x
 955   // num_workers        ntid.y
 956   // vector length      ntid.x
 957
 958   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
 959   acc_prof_info *prof_info = thr->prof_info;
 960   acc_event_info enqueue_launch_event_info;
 961   acc_api_info *api_info = thr->api_info;
 962   bool profiling_p = __builtin_expect (prof_info != NULL, false);
 963   if (profiling_p)
 964     {
 965       prof_info->event_type = acc_ev_enqueue_launch_start;
 966
 967       enqueue_launch_event_info.launch_event.event_type
 968         = prof_info->event_type;
 969       enqueue_launch_event_info.launch_event.valid_bytes
 970         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
 971       enqueue_launch_event_info.launch_event.parent_construct
 972         = acc_construct_parallel;
 973       enqueue_launch_event_info.launch_event.implicit = 1;
 974       enqueue_launch_event_info.launch_event.tool_info = NULL;
 975       enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
 976       enqueue_launch_event_info.launch_event.num_gangs
 977         = dims[GOMP_DIM_GANG];
 978       enqueue_launch_event_info.launch_event.num_workers
 979         = dims[GOMP_DIM_WORKER];
 980       enqueue_launch_event_info.launch_event.vector_length
 981         = dims[GOMP_DIM_VECTOR];
 982
 983       api_info->device_api = acc_device_api_cuda;
 984
 985       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
 986                                             api_info);
 987     }
 988
 989   kargs[0] = &dp;
 990   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 991                     dims[GOMP_DIM_GANG], 1, 1,
 992                     dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 993                     0, stream, kargs, 0);
 994
 995   if (profiling_p)
 996     {
 997       prof_info->event_type = acc_ev_enqueue_launch_end;
 998       enqueue_launch_event_info.launch_event.event_type
 999         = prof_info->event_type;
1000       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1001                                             api_info);
1002     }
1003
1004   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1005                      targ_fn->launch->fn);
1006 }
1007
1008 void * openacc_get_current_cuda_context (void);
1009
1010 static void
1011 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1012 {
1013   acc_prof_info *prof_info = thr->prof_info;
1014   acc_event_info data_event_info;
1015   acc_api_info *api_info = thr->api_info;
1016
1017   prof_info->event_type = acc_ev_alloc;
1018
1019   data_event_info.data_event.event_type = prof_info->event_type;
1020   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1021   data_event_info.data_event.parent_construct = acc_construct_parallel;
1022   data_event_info.data_event.implicit = 1;
1023   data_event_info.data_event.tool_info = NULL;
1024   data_event_info.data_event.var_name = NULL;
1025   data_event_info.data_event.bytes = s;
1026   data_event_info.data_event.host_ptr = NULL;
1027   data_event_info.data_event.device_ptr = dp;
1028
1029   api_info->device_api = acc_device_api_cuda;
1030
1031   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1032 }
1033
1034 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1035    size threshold, or if FORCE is true.  */
1036
1037 static void
1038 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1039 {
1040   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1041   if (ptx_dev->omp_stacks.ptr
1042       && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1043     {
1044       CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1045       if (r != CUDA_SUCCESS)
1046         GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1047       ptx_dev->omp_stacks.ptr = 0;
1048       ptx_dev->omp_stacks.size = 0;
1049     }
1050   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1051 }
1052
1053 static void *
1054 nvptx_alloc (size_t s, bool suppress_errors)
1055 {
1056   CUdeviceptr d;
1057
1058   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1059   if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1060     return NULL;
1061   else if (r != CUDA_SUCCESS)
1062     {
1063       GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1064       return NULL;
1065     }
1066
1067   /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
1068   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1069   bool profiling_p
1070     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1071   if (profiling_p)
1072     goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1073
1074   return (void *) d;
1075 }
1076
1077 static void
1078 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1079 {
1080   acc_prof_info *prof_info = thr->prof_info;
1081   acc_event_info data_event_info;
1082   acc_api_info *api_info = thr->api_info;
1083
1084   prof_info->event_type = acc_ev_free;
1085
1086   data_event_info.data_event.event_type = prof_info->event_type;
1087   data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1088   data_event_info.data_event.parent_construct = acc_construct_parallel;
1089   data_event_info.data_event.implicit = 1;
1090   data_event_info.data_event.tool_info = NULL;
1091   data_event_info.data_event.var_name = NULL;
1092   data_event_info.data_event.bytes = -1;
1093   data_event_info.data_event.host_ptr = NULL;
1094   data_event_info.data_event.device_ptr = p;
1095
1096   api_info->device_api = acc_device_api_cuda;
1097
1098   GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1099 }
1100
1101 static bool
1102 nvptx_free (void *p, struct ptx_device *ptx_dev)
1103 {
1104   CUdeviceptr pb;
1105   size_t ps;
1106
1107   CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1108                                   (CUdeviceptr) p);
1109   if (r == CUDA_ERROR_NOT_PERMITTED)
1110     {
1111       /* We assume that this error indicates we are in a CUDA callback context,
1112          where all CUDA calls are not allowed (see cuStreamAddCallback
1113          documentation for description). Arrange to free this piece of device
1114          memory later.  */
1115       struct ptx_free_block *n
1116         = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1117       n->ptr = p;
1118       pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1119       n->next = ptx_dev->free_blocks;
1120       ptx_dev->free_blocks = n;
1121       pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1122       return true;
1123     }
1124   else if (r != CUDA_SUCCESS)
1125     {
1126       GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1127       return false;
1128     }
1129   if ((CUdeviceptr) p != pb)
1130     {
1131       GOMP_PLUGIN_error ("invalid device address");
1132       return false;
1133     }
1134
1135   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1136   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1137   bool profiling_p
1138     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1139   if (profiling_p)
1140     goacc_profiling_acc_ev_free (thr, p);
1141
1142   return true;
1143 }
1144
1145 static void *
1146 nvptx_get_current_cuda_device (void)
1147 {
1148   struct nvptx_thread *nvthd = nvptx_thread ();
1149
1150   if (!nvthd || !nvthd->ptx_dev)
1151     return NULL;
1152
1153   return &nvthd->ptx_dev->dev;
1154 }
1155
1156 static void *
1157 nvptx_get_current_cuda_context (void)
1158 {
1159   struct nvptx_thread *nvthd = nvptx_thread ();
1160
1161   if (!nvthd || !nvthd->ptx_dev)
1162     return NULL;
1163
1164   return nvthd->ptx_dev->ctx;
1165 }
1166
1167 /* Plugin entry points.  */
1168
1169 const char *
1170 GOMP_OFFLOAD_get_name (void)
1171 {
1172   return "nvptx";
1173 }
1174
1175 unsigned int
1176 GOMP_OFFLOAD_get_caps (void)
1177 {
1178   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1179 }
1180
1181 int
1182 GOMP_OFFLOAD_get_type (void)
1183 {
1184   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1185 }
1186
1187 int
1188 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1189 {
1190   int num_devices = nvptx_get_num_devices ();
1191   /* Return -1 if no omp_requires_mask cannot be fulfilled but
1192      devices were present.  Unified-shared address: see comment in
1193      nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
1194   if (num_devices > 0
1195       && ((omp_requires_mask
1196            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1197                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1198     return -1;
1199   return num_devices;
1200 }
1201
1202 bool
1203 GOMP_OFFLOAD_init_device (int n)
1204 {
1205   struct ptx_device *dev;
1206
1207   pthread_mutex_lock (&ptx_dev_lock);
1208
1209   if (!nvptx_init () || ptx_devices[n] != NULL)
1210     {
1211       pthread_mutex_unlock (&ptx_dev_lock);
1212       return false;
1213     }
1214
1215   dev = nvptx_open_device (n);
1216   if (dev)
1217     {
1218       ptx_devices[n] = dev;
1219       instantiated_devices++;
1220     }
1221
1222   pthread_mutex_unlock (&ptx_dev_lock);
1223
1224   return dev != NULL;
1225 }
1226
1227 bool
1228 GOMP_OFFLOAD_fini_device (int n)
1229 {
1230   pthread_mutex_lock (&ptx_dev_lock);
1231
1232   if (ptx_devices[n] != NULL)
1233     {
1234       if (!nvptx_attach_host_thread_to_device (n)
1235           || !nvptx_close_device (ptx_devices[n]))
1236         {
1237           pthread_mutex_unlock (&ptx_dev_lock);
1238           return false;
1239         }
1240       ptx_devices[n] = NULL;
1241       instantiated_devices--;
1242     }
1243
1244   if (instantiated_devices == 0)
1245     {
1246       free (ptx_devices);
1247       ptx_devices = NULL;
1248     }
1249
1250   pthread_mutex_unlock (&ptx_dev_lock);
1251   return true;
1252 }
1253
1254 /* Return the libgomp version number we're compatible with.  There is
1255    no requirement for cross-version compatibility.  */
1256
1257 unsigned
1258 GOMP_OFFLOAD_version (void)
1259 {
1260   return GOMP_VERSION;
1261 }
1262
1263 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1264
1265 static void
1266 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1267 {
1268   CUdeviceptr dptr;
1269   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1270                                   module, "__nvptx_clocktick");
1271   if (r == CUDA_ERROR_NOT_FOUND)
1272     return;
1273   if (r != CUDA_SUCCESS)
1274     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1275   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1276   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1277                          sizeof (__nvptx_clocktick));
1278   if (r != CUDA_SUCCESS)
1279     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1280 }
1281
1282 /* Load the (partial) program described by TARGET_DATA to device
1283    number ORD.  Allocate and return TARGET_TABLE.  If not NULL, REV_FN_TABLE
1284    will contain the on-device addresses of the functions for reverse offload.
1285    To be freed by the caller.  */
1286
1287 int
1288 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1289                          struct addr_pair **target_table,
1290                          uint64_t **rev_fn_table,
1291                          uint64_t *host_ind_fn_table)
1292 {
1293   CUmodule module;
1294   const char *const *var_names;
1295   const struct targ_fn_launch *fn_descs;
1296   unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
1297   struct targ_fn_descriptor *targ_fns;
1298   struct addr_pair *targ_tbl;
1299   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1300   struct ptx_image_data *new_image;
1301   struct ptx_device *dev;
1302
1303   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1304     {
1305       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1306                          " (expected %u, received %u)",
1307                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1308       return -1;
1309     }
1310
1311   if (!nvptx_attach_host_thread_to_device (ord)
1312       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1313     return -1;
1314
1315   dev = ptx_devices[ord];
1316
1317   /* The mkoffload utility emits a struct of pointers/integers at the
1318      start of each offload image.  The array of kernel names and the
1319      functions addresses form a one-to-one correspondence.  */
1320
1321   var_entries = img_header->var_num;
1322   var_names = img_header->var_names;
1323   fn_entries = img_header->fn_num;
1324   fn_descs = img_header->fn_descs;
1325   ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1326                      ? img_header->ind_fn_num : 0;
1327
1328   /* Currently, other_entries contains only the struct of ICVs.  */
1329   other_entries = 1;
1330
1331   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1332                                  * (fn_entries + var_entries + other_entries));
1333   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1334                                  * fn_entries);
1335
1336   *target_table = targ_tbl;
1337
1338   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1339   new_image->target_data = target_data;
1340   new_image->module = module;
1341   new_image->fns = targ_fns;
1342
1343   pthread_mutex_lock (&dev->image_lock);
1344   new_image->next = dev->images;
1345   dev->images = new_image;
1346   pthread_mutex_unlock (&dev->image_lock);
1347
1348   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1349     {
1350       CUfunction function;
1351       int nregs, mthrs;
1352
1353       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1354                       fn_descs[i].fn);
1355       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1356                       CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1357       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1358                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1359
1360       targ_fns->fn = function;
1361       targ_fns->launch = &fn_descs[i];
1362       targ_fns->regs_per_thread = nregs;
1363       targ_fns->max_threads_per_block = mthrs;
1364
1365       targ_tbl->start = (uintptr_t) targ_fns;
1366       targ_tbl->end = targ_tbl->start + 1;
1367     }
1368
1369   for (j = 0; j < var_entries; j++, targ_tbl++)
1370     {
1371       CUdeviceptr var;
1372       size_t bytes;
1373
1374       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1375                       &var, &bytes, module, var_names[j]);
1376
1377       targ_tbl->start = (uintptr_t) var;
1378       targ_tbl->end = targ_tbl->start + bytes;
1379     }
1380
1381   if (ind_fn_entries > 0)
1382     {
1383       CUdeviceptr var;
1384       size_t bytes;
1385
1386       /* Read indirect function table from image.  */
1387       CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1388                                       "$offload_ind_func_table");
1389       if (r != CUDA_SUCCESS)
1390         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1391       assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1392
1393       uint64_t ind_fn_table[ind_fn_entries];
1394       r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1395       if (r != CUDA_SUCCESS)
1396         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1397
1398       /* Build host->target address map for indirect functions.  */
1399       uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1400       for (unsigned k = 0; k < ind_fn_entries; k++)
1401         {
1402           ind_fn_map[k * 2] = host_ind_fn_table[k];
1403           ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1404           GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1405                              k, host_ind_fn_table[k], ind_fn_table[k]);
1406         }
1407       ind_fn_map[ind_fn_entries * 2] = 0;
1408
1409       /* Write the map onto the target.  */
1410       void *map_target_addr
1411         = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1412       GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1413
1414       GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1415                              (void*) ind_fn_map,
1416                              sizeof (ind_fn_map));
1417
1418       /* Write address of the map onto the target.  */
1419       CUdeviceptr varptr;
1420       size_t varsize;
1421       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1422                              module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1423       if (r != CUDA_SUCCESS)
1424         GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1425                            cuda_error (r));
1426
1427       GOMP_PLUGIN_debug (0,
1428                          "Indirect map variable found at %llx with size %ld\n",
1429                          varptr, varsize);
1430
1431       GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1432                              sizeof (map_target_addr));
1433     }
1434
1435   CUdeviceptr varptr;
1436   size_t varsize;
1437   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1438                                   module, XSTRING (GOMP_ADDITIONAL_ICVS));
1439
1440   if (r == CUDA_SUCCESS)
1441     {
1442       targ_tbl->start = (uintptr_t) varptr;
1443       targ_tbl->end = (uintptr_t) (varptr + varsize);
1444     }
1445   else
1446     /* The variable was not in this image.  */
1447     targ_tbl->start = targ_tbl->end = 0;
1448
1449   if (rev_fn_table && fn_entries == 0)
1450     *rev_fn_table = NULL;
1451   else if (rev_fn_table)
1452     {
1453       CUdeviceptr var;
1454       size_t bytes;
1455       unsigned int i;
1456       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1457                              "$offload_func_table");
1458       if (r != CUDA_SUCCESS)
1459         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1460       assert (bytes == sizeof (uint64_t) * fn_entries);
1461       *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1462       r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1463       if (r != CUDA_SUCCESS)
1464         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1465       /* Free if only NULL entries.  */
1466       for (i = 0; i < fn_entries; ++i)
1467         if ((*rev_fn_table)[i] != 0)
1468           break;
1469       if (i == fn_entries)
1470         {
1471           free (*rev_fn_table);
1472           *rev_fn_table = NULL;
1473         }
1474     }
1475
1476   if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1477     {
1478       /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
1479          available but it might be not.  One reason could be: if the user code
1480          has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1481          is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1482          are not linked in.  */
1483       CUdeviceptr device_rev_offload_var;
1484       size_t device_rev_offload_size;
1485       CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1486                                       &device_rev_offload_var,
1487                                       &device_rev_offload_size, module,
1488                                       XSTRING (GOMP_REV_OFFLOAD_VAR));
1489       if (r != CUDA_SUCCESS)
1490         {
1491           free (*rev_fn_table);
1492           *rev_fn_table = NULL;
1493         }
1494       else
1495         {
1496           /* cuMemHostAlloc memory is accessible on the device, if
1497              unified-shared address is supported; this is assumed - see comment
1498              in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1499           CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1500                             sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1501           CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1502           r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1503                                  sizeof (dp));
1504           if (r != CUDA_SUCCESS)
1505             GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1506         }
1507     }
1508
1509   nvptx_set_clocktick (module, dev);
1510
1511   return fn_entries + var_entries + other_entries;
1512 }
1513
1514 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1515    function descriptors allocated by G_O_load_image.  */
1516
1517 bool
1518 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1519 {
1520   struct ptx_image_data *image, **prev_p;
1521   struct ptx_device *dev = ptx_devices[ord];
1522
1523   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1524     {
1525       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1526                          " (expected %u, received %u)",
1527                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1528       return false;
1529     }
1530
1531   bool ret = true;
1532   pthread_mutex_lock (&dev->image_lock);
1533   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1534     if (image->target_data == target_data)
1535       {
1536         *prev_p = image->next;
1537         if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1538           ret = false;
1539         free (image->fns);
1540         free (image);
1541         break;
1542       }
1543   pthread_mutex_unlock (&dev->image_lock);
1544   return ret;
1545 }
1546
1547 void *
1548 GOMP_OFFLOAD_alloc (int ord, size_t size)
1549 {
1550   if (!nvptx_attach_host_thread_to_device (ord))
1551     return NULL;
1552
1553   struct ptx_device *ptx_dev = ptx_devices[ord];
1554   struct ptx_free_block *blocks, *tmp;
1555
1556   pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1557   blocks = ptx_dev->free_blocks;
1558   ptx_dev->free_blocks = NULL;
1559   pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1560
1561   nvptx_stacks_free (ptx_dev, false);
1562
1563   while (blocks)
1564     {
1565       tmp = blocks->next;
1566       nvptx_free (blocks->ptr, ptx_dev);
1567       free (blocks);
1568       blocks = tmp;
1569     }
1570
1571   void *d = nvptx_alloc (size, true);
1572   if (d)
1573     return d;
1574   else
1575     {
1576       /* Memory allocation failed.  Try freeing the stacks block, and
1577          retrying.  */
1578       nvptx_stacks_free (ptx_dev, true);
1579       return nvptx_alloc (size, false);
1580     }
1581 }
1582
1583 bool
1584 GOMP_OFFLOAD_free (int ord, void *ptr)
1585 {
1586   return (nvptx_attach_host_thread_to_device (ord)
1587           && nvptx_free (ptr, ptx_devices[ord]));
1588 }
1589
1590 void
1591 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1592                            size_t mapnum  __attribute__((unused)),
1593                            void **hostaddrs __attribute__((unused)),
1594                            void **devaddrs,
1595                            unsigned *dims, void *targ_mem_desc)
1596 {
1597   GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1598
1599   CUdeviceptr dp = (CUdeviceptr) devaddrs;
1600   nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
1601
1602   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1603   const char *maybe_abort_msg = "(perhaps abort was called)";
1604   if (r == CUDA_ERROR_LAUNCH_FAILED)
1605     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1606                        maybe_abort_msg);
1607   else if (r != CUDA_SUCCESS)
1608     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1609 }
1610
1611 void
1612 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1613                                  size_t mapnum __attribute__((unused)),
1614                                  void **hostaddrs __attribute__((unused)),
1615                                  void **devaddrs,
1616                                  unsigned *dims, void *targ_mem_desc,
1617                                  struct goacc_asyncqueue *aq)
1618 {
1619   GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1620
1621   CUdeviceptr dp = (CUdeviceptr) devaddrs;
1622   nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
1623 }
1624
1625 void *
1626 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1627 {
1628   struct ptx_device *ptx_dev;
1629   struct nvptx_thread *nvthd
1630     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1631   CUcontext thd_ctx;
1632
1633   ptx_dev = ptx_devices[ord];
1634
1635   assert (ptx_dev);
1636
1637   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1638
1639   assert (ptx_dev->ctx);
1640
1641   if (!thd_ctx)
1642     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1643
1644   nvthd->ptx_dev = ptx_dev;
1645
1646   return (void *) nvthd;
1647 }
1648
1649 void
1650 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1651 {
1652   free (data);
1653 }
1654
1655 void *
1656 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1657 {
1658   return nvptx_get_current_cuda_device ();
1659 }
1660
1661 void *
1662 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1663 {
1664   return nvptx_get_current_cuda_context ();
1665 }
1666
1667 /* This returns a CUstream.  */
1668 void *
1669 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1670 {
1671   return (void *) aq->cuda_stream;
1672 }
1673
1674 /* This takes a CUstream.  */
1675 int
1676 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1677 {
1678   if (aq->cuda_stream)
1679     {
1680       CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1681       CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1682     }
1683
1684   aq->cuda_stream = (CUstream) stream;
1685   return 1;
1686 }
1687
1688 static struct goacc_asyncqueue *
1689 nvptx_goacc_asyncqueue_construct (unsigned int flags)
1690 {
1691   CUstream stream = NULL;
1692   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1693
1694   struct goacc_asyncqueue *aq
1695     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1696   aq->cuda_stream = stream;
1697   return aq;
1698 }
1699
1700 struct goacc_asyncqueue *
1701 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1702 {
1703   return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1704 }
1705
1706 static bool
1707 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1708 {
1709   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1710   free (aq);
1711   return true;
1712 }
1713
1714 bool
1715 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1716 {
1717   return nvptx_goacc_asyncqueue_destruct (aq);
1718 }
1719
1720 int
1721 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1722 {
1723   CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1724   if (r == CUDA_SUCCESS)
1725     return 1;
1726   if (r == CUDA_ERROR_NOT_READY)
1727     return 0;
1728
1729   GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1730   return -1;
1731 }
1732
1733 static bool
1734 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1735 {
1736   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1737   return true;
1738 }
1739
1740 bool
1741 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1742 {
1743   return nvptx_goacc_asyncqueue_synchronize (aq);
1744 }
1745
1746 bool
1747 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1748                                       struct goacc_asyncqueue *aq2)
1749 {
1750   CUevent e;
1751   CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1752   CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1753   CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1754   return true;
1755 }
1756
1757 static void
1758 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1759 {
1760   if (res != CUDA_SUCCESS)
1761     GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1762   struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1763   cb->fn (cb->ptr);
1764   free (ptr);
1765 }
1766
1767 void
1768 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1769                                            void (*callback_fn)(void *),
1770                                            void *userptr)
1771 {
1772   struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1773   b->fn = callback_fn;
1774   b->ptr = userptr;
1775   b->aq = aq;
1776   CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1777                     cuda_callback_wrapper, (void *) b, 0);
1778 }
1779
1780 static bool
1781 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1782 {
1783   CUdeviceptr pb;
1784   size_t ps;
1785   if (!s)
1786     return true;
1787   if (!d)
1788     {
1789       GOMP_PLUGIN_error ("invalid device address");
1790       return false;
1791     }
1792   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1793   if (!pb)
1794     {
1795       GOMP_PLUGIN_error ("invalid device address");
1796       return false;
1797     }
1798   if (!h)
1799     {
1800       GOMP_PLUGIN_error ("invalid host address");
1801       return false;
1802     }
1803   if (d == h)
1804     {
1805       GOMP_PLUGIN_error ("invalid host or device address");
1806       return false;
1807     }
1808   if ((void *)(d + s) > (void *)(pb + ps))
1809     {
1810       GOMP_PLUGIN_error ("invalid size");
1811       return false;
1812     }
1813   return true;
1814 }
1815
1816 bool
1817 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1818 {
1819   if (!nvptx_attach_host_thread_to_device (ord)
1820       || !cuda_memcpy_sanity_check (src, dst, n))
1821     return false;
1822   CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1823   return true;
1824 }
1825
1826 bool
1827 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1828 {
1829   if (!nvptx_attach_host_thread_to_device (ord)
1830       || !cuda_memcpy_sanity_check (dst, src, n))
1831     return false;
1832   CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1833   return true;
1834 }
1835
1836 bool
1837 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1838 {
1839   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1840   return true;
1841 }
1842
1843 int
1844 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
1845                        size_t dim0_len, void *dst, size_t dst_offset1_size,
1846                        size_t dst_offset0_len, size_t dst_dim1_size,
1847                        const void *src, size_t src_offset1_size,
1848                        size_t src_offset0_len, size_t src_dim1_size)
1849 {
1850   if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1851     return false;
1852
1853   /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
1854
1855   CUDA_MEMCPY2D data;
1856
1857   memset (&data, 0, sizeof (data));
1858   data.WidthInBytes = dim1_size;
1859   data.Height = dim0_len;
1860
1861   if (dst_ord == -1)
1862     {
1863       data.dstMemoryType = CU_MEMORYTYPE_HOST;
1864       data.dstHost = dst;
1865     }
1866   else
1867     {
1868       data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1869       data.dstDevice = (CUdeviceptr) dst;
1870     }
1871   data.dstPitch = dst_dim1_size;
1872   data.dstXInBytes = dst_offset1_size;
1873   data.dstY = dst_offset0_len;
1874
1875   if (src_ord == -1)
1876     {
1877       data.srcMemoryType = CU_MEMORYTYPE_HOST;
1878       data.srcHost = src;
1879     }
1880   else
1881     {
1882       data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1883       data.srcDevice = (CUdeviceptr) src;
1884     }
1885   data.srcPitch = src_dim1_size;
1886   data.srcXInBytes = src_offset1_size;
1887   data.srcY = src_offset0_len;
1888
1889   CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
1890   if (res == CUDA_ERROR_INVALID_VALUE)
1891     /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
1892        for (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D fails
1893        with an error; try the slower cuMemcpy2DUnaligned now.  */
1894     CUDA_CALL (cuMemcpy2DUnaligned, &data);
1895   else if (res != CUDA_SUCCESS)
1896     {
1897       GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
1898       return false;
1899     }
1900   return true;
1901 }
1902
1903 int
1904 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
1905                        size_t dim1_len, size_t dim0_len, void *dst,
1906                        size_t dst_offset2_size, size_t dst_offset1_len,
1907                        size_t dst_offset0_len, size_t dst_dim2_size,
1908                        size_t dst_dim1_len, const void *src,
1909                        size_t src_offset2_size, size_t src_offset1_len,
1910                        size_t src_offset0_len, size_t src_dim2_size,
1911                        size_t src_dim1_len)
1912 {
1913   if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1914     return false;
1915
1916   /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */
1917
1918   CUDA_MEMCPY3D data;
1919
1920   memset (&data, 0, sizeof (data));
1921   data.WidthInBytes = dim2_size;
1922   data.Height = dim1_len;
1923   data.Depth = dim0_len;
1924
1925   if (dst_ord == -1)
1926     {
1927       data.dstMemoryType = CU_MEMORYTYPE_HOST;
1928       data.dstHost = dst;
1929     }
1930   else
1931     {
1932       data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1933       data.dstDevice = (CUdeviceptr) dst;
1934     }
1935   data.dstPitch = dst_dim2_size;
1936   data.dstHeight = dst_dim1_len;
1937   data.dstXInBytes = dst_offset2_size;
1938   data.dstY = dst_offset1_len;
1939   data.dstZ = dst_offset0_len;
1940
1941   if (src_ord == -1)
1942     {
1943       data.srcMemoryType = CU_MEMORYTYPE_HOST;
1944       data.srcHost = src;
1945     }
1946   else
1947     {
1948       data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1949       data.srcDevice = (CUdeviceptr) src;
1950     }
1951   data.srcPitch = src_dim2_size;
1952   data.srcHeight = src_dim1_len;
1953   data.srcXInBytes = src_offset2_size;
1954   data.srcY = src_offset1_len;
1955   data.srcZ = src_offset0_len;
1956
1957   CUDA_CALL (cuMemcpy3D, &data);
1958   return true;
1959 }
1960
1961 bool
1962 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1963                                      size_t n, struct goacc_asyncqueue *aq)
1964 {
1965   if (!nvptx_attach_host_thread_to_device (ord)
1966       || !cuda_memcpy_sanity_check (src, dst, n))
1967     return false;
1968   CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1969   return true;
1970 }
1971
1972 bool
1973 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1974                                      size_t n, struct goacc_asyncqueue *aq)
1975 {
1976   if (!nvptx_attach_host_thread_to_device (ord)
1977       || !cuda_memcpy_sanity_check (dst, src, n))
1978     return false;
1979   CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1980   return true;
1981 }
1982
1983 union goacc_property_value
1984 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1985 {
1986   union goacc_property_value propval = { .val = 0 };
1987
1988   pthread_mutex_lock (&ptx_dev_lock);
1989
1990   if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1991     {
1992       pthread_mutex_unlock (&ptx_dev_lock);
1993       return propval;
1994     }
1995
1996   struct ptx_device *ptx_dev = ptx_devices[n];
1997   switch (prop)
1998     {
1999     case GOACC_PROPERTY_MEMORY:
2000       {
2001         size_t total_mem;
2002
2003         CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2004         propval.val = total_mem;
2005       }
2006       break;
2007     case GOACC_PROPERTY_FREE_MEMORY:
2008       {
2009         size_t total_mem;
2010         size_t free_mem;
2011         CUdevice ctxdev;
2012
2013         CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2014         if (ptx_dev->dev == ctxdev)
2015           CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2016         else if (ptx_dev->ctx)
2017           {
2018             CUcontext old_ctx;
2019
2020             CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2021             CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2022             CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2023           }
2024         else
2025           {
2026             CUcontext new_ctx;
2027
2028             CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2029                             ptx_dev->dev);
2030             CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2031             CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2032           }
2033         propval.val = free_mem;
2034       }
2035       break;
2036     case GOACC_PROPERTY_NAME:
2037       propval.ptr = ptx_dev->name;
2038       break;
2039     case GOACC_PROPERTY_VENDOR:
2040       propval.ptr = "Nvidia";
2041       break;
2042     case GOACC_PROPERTY_DRIVER:
2043       propval.ptr = cuda_driver_version_s;
2044       break;
2045     default:
2046       break;
2047     }
2048
2049   pthread_mutex_unlock (&ptx_dev_lock);
2050   return propval;
2051 }
2052
2053 /* Adjust launch dimensions: pick good values for number of blocks and warps
2054    and ensure that number of warps does not exceed CUDA limits as well as GCC's
2055    own limits.  */
2056
2057 static void
2058 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2059                             struct ptx_device *ptx_dev,
2060                             int *teams_p, int *threads_p)
2061 {
2062   int max_warps_block = fn->max_threads_per_block / 32;
2063   /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2064      and libgcc, which matches documented limit of all GPUs as of 2015.  */
2065   if (max_warps_block > 32)
2066     max_warps_block = 32;
2067   if (*threads_p <= 0)
2068     *threads_p = 8;
2069   if (*threads_p > max_warps_block)
2070     *threads_p = max_warps_block;
2071
2072   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2073   /* This is an estimate of how many blocks the device can host simultaneously.
2074      Actual limit, which may be lower, can be queried with "occupancy control"
2075      driver interface (since CUDA 6.0).  */
2076   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2077   if (*teams_p <= 0 || *teams_p > max_blocks)
2078     *teams_p = max_blocks;
2079 }
2080
2081 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2082    target regions.  */
2083
2084 static size_t
2085 nvptx_stacks_size ()
2086 {
2087   return 128 * 1024;
2088 }
2089
2090 /* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
2091    the storage should be held on entry, and remains held on exit.  */
2092
2093 static void *
2094 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2095 {
2096   if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2097     return (void *) ptx_dev->omp_stacks.ptr;
2098
2099   /* Free the old, too-small stacks.  */
2100   if (ptx_dev->omp_stacks.ptr)
2101     {
2102       CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2103       if (r != CUDA_SUCCESS)
2104         GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2105       r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2106       if (r != CUDA_SUCCESS)
2107         GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2108     }
2109
2110   /* Make new and bigger stacks, and remember where we put them and how big
2111      they are.  */
2112   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2113                                   size * num);
2114   if (r != CUDA_SUCCESS)
2115     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2116
2117   ptx_dev->omp_stacks.size = size * num;
2118
2119   return (void *) ptx_dev->omp_stacks.ptr;
2120 }
2121
2122
2123 void
2124 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2125 {
2126   struct targ_fn_descriptor *tgt_fn_desc
2127     = (struct targ_fn_descriptor *) tgt_fn;
2128   CUfunction function = tgt_fn_desc->fn;
2129   const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2130   const char *fn_name = launch->fn;
2131   CUresult r;
2132   struct ptx_device *ptx_dev = ptx_devices[ord];
2133   const char *maybe_abort_msg = "(perhaps abort was called)";
2134   int teams = 0, threads = 0;
2135
2136   if (!args)
2137     GOMP_PLUGIN_fatal ("No target arguments provided");
2138   while (*args)
2139     {
2140       intptr_t id = (intptr_t) *args++, val;
2141       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2142         val = (intptr_t) *args++;
2143       else
2144         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2145       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2146         continue;
2147       val = val > INT_MAX ? INT_MAX : val;
2148       id &= GOMP_TARGET_ARG_ID_MASK;
2149       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2150         teams = val;
2151       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2152         threads = val;
2153     }
2154   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2155
2156   bool reverse_offload = ptx_dev->rev_data != NULL;
2157   struct goacc_asyncqueue *reverse_offload_aq = NULL;
2158   if (reverse_offload)
2159     {
2160       reverse_offload_aq
2161         = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2162       if (!reverse_offload_aq)
2163         exit (EXIT_FAILURE);
2164     }
2165
2166   size_t stack_size = nvptx_stacks_size ();
2167
2168   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2169   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2170   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2171   size_t fn_args_size = sizeof fn_args;
2172   void *config[] = {
2173     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2174     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2175     CU_LAUNCH_PARAM_END
2176   };
2177   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
2178                      " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2179                      __FUNCTION__, fn_name, teams, threads);
2180   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2181                          32, threads, 1, 0, NULL, NULL, config);
2182   if (r != CUDA_SUCCESS)
2183     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2184   if (reverse_offload)
2185     while (true)
2186       {
2187         r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2188         if (r == CUDA_SUCCESS)
2189           break;
2190         if (r == CUDA_ERROR_LAUNCH_FAILED)
2191           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2192                              maybe_abort_msg);
2193         else if (r != CUDA_ERROR_NOT_READY)
2194           GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2195
2196         if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2197           {
2198             struct rev_offload *rev_data = ptx_dev->rev_data;
2199             GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2200                                     rev_data->addrs, rev_data->sizes,
2201                                     rev_data->kinds, rev_data->dev_num,
2202                                     reverse_offload_aq);
2203             if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2204               exit (EXIT_FAILURE);
2205             __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2206           }
2207         usleep (1);
2208       }
2209   else
2210     r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2211   if (r == CUDA_ERROR_LAUNCH_FAILED)
2212     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2213                        maybe_abort_msg);
2214   else if (r != CUDA_SUCCESS)
2215     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2216
2217   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2218
2219   if (reverse_offload)
2220     {
2221       if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2222         exit (EXIT_FAILURE);
2223     }
2224 }
2225
2226 /* TODO: Implement GOMP_OFFLOAD_async_run. */