libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2018 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #include "openacc.h"
  35 #include "config.h"
  36 #include "libgomp-plugin.h"
  37 #include "oacc-plugin.h"
  38 #include "gomp-constants.h"
  39
  40 #include <pthread.h>
  41 #include <cuda.h>
  42 #include <stdbool.h>
  43 #include <stdint.h>
  44 #include <limits.h>
  45 #include <string.h>
  46 #include <stdio.h>
  47 #include <unistd.h>
  48 #include <assert.h>
  49 #include <errno.h>
  50
  51 #if PLUGIN_NVPTX_DYNAMIC
  52 # include <dlfcn.h>
  53
  54 # define CUDA_CALLS \
  55 CUDA_ONE_CALL (cuCtxCreate)             \
  56 CUDA_ONE_CALL (cuCtxDestroy)            \
  57 CUDA_ONE_CALL (cuCtxGetCurrent)         \
  58 CUDA_ONE_CALL (cuCtxGetDevice)          \
  59 CUDA_ONE_CALL (cuCtxPopCurrent)         \
  60 CUDA_ONE_CALL (cuCtxPushCurrent)        \
  61 CUDA_ONE_CALL (cuCtxSynchronize)        \
  62 CUDA_ONE_CALL (cuDeviceGet)             \
  63 CUDA_ONE_CALL (cuDeviceGetAttribute)    \
  64 CUDA_ONE_CALL (cuDeviceGetCount)        \
  65 CUDA_ONE_CALL (cuEventCreate)           \
  66 CUDA_ONE_CALL (cuEventDestroy)          \
  67 CUDA_ONE_CALL (cuEventElapsedTime)      \
  68 CUDA_ONE_CALL (cuEventQuery)            \
  69 CUDA_ONE_CALL (cuEventRecord)           \
  70 CUDA_ONE_CALL (cuEventSynchronize)      \
  71 CUDA_ONE_CALL (cuFuncGetAttribute)      \
  72 CUDA_ONE_CALL (cuGetErrorString)        \
  73 CUDA_ONE_CALL (cuInit)                  \
  74 CUDA_ONE_CALL (cuLaunchKernel)          \
  75 CUDA_ONE_CALL (cuLinkAddData)           \
  76 CUDA_ONE_CALL (cuLinkComplete)          \
  77 CUDA_ONE_CALL (cuLinkCreate)            \
  78 CUDA_ONE_CALL (cuLinkDestroy)           \
  79 CUDA_ONE_CALL (cuMemAlloc)              \
  80 CUDA_ONE_CALL (cuMemAllocHost)          \
  81 CUDA_ONE_CALL (cuMemcpy)                \
  82 CUDA_ONE_CALL (cuMemcpyDtoDAsync)       \
  83 CUDA_ONE_CALL (cuMemcpyDtoH)            \
  84 CUDA_ONE_CALL (cuMemcpyDtoHAsync)       \
  85 CUDA_ONE_CALL (cuMemcpyHtoD)            \
  86 CUDA_ONE_CALL (cuMemcpyHtoDAsync)       \
  87 CUDA_ONE_CALL (cuMemFree)               \
  88 CUDA_ONE_CALL (cuMemFreeHost)           \
  89 CUDA_ONE_CALL (cuMemGetAddressRange)    \
  90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
  91 CUDA_ONE_CALL (cuModuleGetFunction)     \
  92 CUDA_ONE_CALL (cuModuleGetGlobal)       \
  93 CUDA_ONE_CALL (cuModuleLoad)            \
  94 CUDA_ONE_CALL (cuModuleLoadData)        \
  95 CUDA_ONE_CALL (cuModuleUnload)          \
  96 CUDA_ONE_CALL (cuStreamCreate)          \
  97 CUDA_ONE_CALL (cuStreamDestroy)         \
  98 CUDA_ONE_CALL (cuStreamQuery)           \
  99 CUDA_ONE_CALL (cuStreamSynchronize)     \
 100 CUDA_ONE_CALL (cuStreamWaitEvent)
 101 # define CUDA_ONE_CALL(call) \
 102   __typeof (call) *call;
 103 struct cuda_lib_s {
 104   CUDA_CALLS
 105 } cuda_lib;
 106
 107 /* -1 if init_cuda_lib has not been called yet, false
 108    if it has been and failed, true if it has been and succeeded.  */
 109 static signed char cuda_lib_inited = -1;
 110
 111 /* Dynamically load the CUDA runtime library and initialize function
 112    pointers, return false if unsuccessful, true if successful.  */
 113 static bool
 114 init_cuda_lib (void)
 115 {
 116   if (cuda_lib_inited != -1)
 117     return cuda_lib_inited;
 118   const char *cuda_runtime_lib = "libcuda.so.1";
 119   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
 120   cuda_lib_inited = false;
 121   if (h == NULL)
 122     return false;
 123 # undef CUDA_ONE_CALL
 124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
 125 # define CUDA_ONE_CALL_1(call) \
 126   cuda_lib.call = dlsym (h, #call);     \
 127   if (cuda_lib.call == NULL)            \
 128     return false;
 129   CUDA_CALLS
 130   cuda_lib_inited = true;
 131   return true;
 132 }
 133 # undef CUDA_ONE_CALL
 134 # undef CUDA_ONE_CALL_1
 135 # define CUDA_CALL_PREFIX cuda_lib.
 136 #else
 137 # define CUDA_CALL_PREFIX
 138 # define init_cuda_lib() true
 139 #endif
 140
 141 /* Convenience macros for the frequently used CUDA library call and
 142    error handling sequence as well as CUDA library calls that
 143    do the error checking themselves or don't do it at all.  */
 144
 145 #define CUDA_CALL_ERET(ERET, FN, ...)           \
 146   do {                                          \
 147     unsigned __r                                \
 148       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 149     if (__r != CUDA_SUCCESS)                    \
 150       {                                         \
 151         GOMP_PLUGIN_error (#FN " error: %s",    \
 152                            cuda_error (__r));   \
 153         return ERET;                            \
 154       }                                         \
 155   } while (0)
 156
 157 #define CUDA_CALL(FN, ...)                      \
 158   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 159
 160 #define CUDA_CALL_ASSERT(FN, ...)               \
 161   do {                                          \
 162     unsigned __r                                \
 163       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 164     if (__r != CUDA_SUCCESS)                    \
 165       {                                         \
 166         GOMP_PLUGIN_fatal (#FN " error: %s",    \
 167                            cuda_error (__r));   \
 168       }                                         \
 169   } while (0)
 170
 171 #define CUDA_CALL_NOCHECK(FN, ...)              \
 172   CUDA_CALL_PREFIX FN (__VA_ARGS__)
 173
 174 static const char *
 175 cuda_error (CUresult r)
 176 {
 177 #if CUDA_VERSION < 7000
 178   /* Specified in documentation and present in library from at least
 179      5.5.  Not declared in header file prior to 7.0.  */
 180   extern CUresult cuGetErrorString (CUresult, const char **);
 181 #endif
 182   const char *desc;
 183
 184   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
 185   if (r != CUDA_SUCCESS)
 186     desc = "unknown cuda error";
 187
 188   return desc;
 189 }
 190
 191 static unsigned int instantiated_devices = 0;
 192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 193
 194 struct ptx_stream
 195 {
 196   CUstream stream;
 197   pthread_t host_thread;
 198   bool multithreaded;
 199
 200   CUdeviceptr d;
 201   void *h;
 202   void *h_begin;
 203   void *h_end;
 204   void *h_next;
 205   void *h_prev;
 206   void *h_tail;
 207
 208   struct ptx_stream *next;
 209 };
 210
 211 /* Thread-specific data for PTX.  */
 212
 213 struct nvptx_thread
 214 {
 215   struct ptx_stream *current_stream;
 216   struct ptx_device *ptx_dev;
 217 };
 218
 219 struct map
 220 {
 221   int     async;
 222   size_t  size;
 223   char    mappings[0];
 224 };
 225
 226 static bool
 227 map_init (struct ptx_stream *s)
 228 {
 229   int size = getpagesize ();
 230
 231   assert (s);
 232   assert (!s->d);
 233   assert (!s->h);
 234
 235   CUDA_CALL (cuMemAllocHost, &s->h, size);
 236   CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 237
 238   assert (s->h);
 239
 240   s->h_begin = s->h;
 241   s->h_end = s->h_begin + size;
 242   s->h_next = s->h_prev = s->h_tail = s->h_begin;
 243
 244   assert (s->h_next);
 245   assert (s->h_end);
 246   return true;
 247 }
 248
 249 static bool
 250 map_fini (struct ptx_stream *s)
 251 {
 252   CUDA_CALL (cuMemFreeHost, s->h);
 253   return true;
 254 }
 255
 256 static void
 257 map_pop (struct ptx_stream *s)
 258 {
 259   struct map *m;
 260
 261   assert (s != NULL);
 262   assert (s->h_next);
 263   assert (s->h_prev);
 264   assert (s->h_tail);
 265
 266   m = s->h_tail;
 267
 268   s->h_tail += m->size;
 269
 270   if (s->h_tail >= s->h_end)
 271     s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
 272
 273   if (s->h_next == s->h_tail)
 274     s->h_prev = s->h_next;
 275
 276   assert (s->h_next >= s->h_begin);
 277   assert (s->h_tail >= s->h_begin);
 278   assert (s->h_prev >= s->h_begin);
 279
 280   assert (s->h_next <= s->h_end);
 281   assert (s->h_tail <= s->h_end);
 282   assert (s->h_prev <= s->h_end);
 283 }
 284
 285 static void
 286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
 287 {
 288   int left;
 289   int offset;
 290   struct map *m;
 291
 292   assert (s != NULL);
 293
 294   left = s->h_end - s->h_next;
 295   size += sizeof (struct map);
 296
 297   assert (s->h_prev);
 298   assert (s->h_next);
 299
 300   if (size >= left)
 301     {
 302       m = s->h_prev;
 303       m->size += left;
 304       s->h_next = s->h_begin;
 305
 306       if (s->h_next + size > s->h_end)
 307         GOMP_PLUGIN_fatal ("unable to push map");
 308     }
 309
 310   assert (s->h_next);
 311
 312   m = s->h_next;
 313   m->async = async;
 314   m->size = size;
 315
 316   offset = (void *)&m->mappings[0] - s->h;
 317
 318   *d = (void *)(s->d + offset);
 319   *h = (void *)(s->h + offset);
 320
 321   s->h_prev = s->h_next;
 322   s->h_next += size;
 323
 324   assert (s->h_prev);
 325   assert (s->h_next);
 326
 327   assert (s->h_next >= s->h_begin);
 328   assert (s->h_tail >= s->h_begin);
 329   assert (s->h_prev >= s->h_begin);
 330   assert (s->h_next <= s->h_end);
 331   assert (s->h_tail <= s->h_end);
 332   assert (s->h_prev <= s->h_end);
 333
 334   return;
 335 }
 336
 337 /* Target data function launch information.  */
 338
 339 struct targ_fn_launch
 340 {
 341   const char *fn;
 342   unsigned short dim[GOMP_DIM_MAX];
 343 };
 344
 345 /* Target PTX object information.  */
 346
 347 struct targ_ptx_obj
 348 {
 349   const char *code;
 350   size_t size;
 351 };
 352
 353 /* Target data image information.  */
 354
 355 typedef struct nvptx_tdata
 356 {
 357   const struct targ_ptx_obj *ptx_objs;
 358   unsigned ptx_num;
 359
 360   const char *const *var_names;
 361   unsigned var_num;
 362
 363   const struct targ_fn_launch *fn_descs;
 364   unsigned fn_num;
 365 } nvptx_tdata_t;
 366
 367 /* Descriptor of a loaded function.  */
 368
 369 struct targ_fn_descriptor
 370 {
 371   CUfunction fn;
 372   const struct targ_fn_launch *launch;
 373   int regs_per_thread;
 374   int max_threads_per_block;
 375 };
 376
 377 /* A loaded PTX image.  */
 378 struct ptx_image_data
 379 {
 380   const void *target_data;
 381   CUmodule module;
 382
 383   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 384
 385   struct ptx_image_data *next;
 386 };
 387
 388 struct ptx_device
 389 {
 390   CUcontext ctx;
 391   bool ctx_shared;
 392   CUdevice dev;
 393   struct ptx_stream *null_stream;
 394   /* All non-null streams associated with this device (actually context),
 395      either created implicitly or passed in from the user (via
 396      acc_set_cuda_stream).  */
 397   struct ptx_stream *active_streams;
 398   struct {
 399     struct ptx_stream **arr;
 400     int size;
 401   } async_streams;
 402   /* A lock for use when manipulating the above stream list and array.  */
 403   pthread_mutex_t stream_lock;
 404   int ord;
 405   bool overlap;
 406   bool map;
 407   bool concur;
 408   bool mkern;
 409   int  mode;
 410   int clock_khz;
 411   int num_sms;
 412   int regs_per_block;
 413   int regs_per_sm;
 414
 415   struct ptx_image_data *images;  /* Images loaded on device.  */
 416   pthread_mutex_t image_lock;     /* Lock for above list.  */
 417
 418   struct ptx_device *next;
 419 };
 420
 421 enum ptx_event_type
 422 {
 423   PTX_EVT_MEM,
 424   PTX_EVT_KNL,
 425   PTX_EVT_SYNC,
 426   PTX_EVT_ASYNC_CLEANUP
 427 };
 428
 429 struct ptx_event
 430 {
 431   CUevent *evt;
 432   int type;
 433   void *addr;
 434   int ord;
 435   int val;
 436
 437   struct ptx_event *next;
 438 };
 439
 440 static pthread_mutex_t ptx_event_lock;
 441 static struct ptx_event *ptx_events;
 442
 443 static struct ptx_device **ptx_devices;
 444
 445 static inline struct nvptx_thread *
 446 nvptx_thread (void)
 447 {
 448   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 449 }
 450
 451 static bool
 452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 453 {
 454   int i;
 455   struct ptx_stream *null_stream
 456     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 457
 458   null_stream->stream = NULL;
 459   null_stream->host_thread = pthread_self ();
 460   null_stream->multithreaded = true;
 461   null_stream->d = (CUdeviceptr) NULL;
 462   null_stream->h = NULL;
 463   if (!map_init (null_stream))
 464     return false;
 465
 466   ptx_dev->null_stream = null_stream;
 467   ptx_dev->active_streams = NULL;
 468   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 469
 470   if (concurrency < 1)
 471     concurrency = 1;
 472
 473   /* This is just a guess -- make space for as many async streams as the
 474      current device is capable of concurrently executing.  This can grow
 475      later as necessary.  No streams are created yet.  */
 476   ptx_dev->async_streams.arr
 477     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
 478   ptx_dev->async_streams.size = concurrency;
 479
 480   for (i = 0; i < concurrency; i++)
 481     ptx_dev->async_streams.arr[i] = NULL;
 482
 483   return true;
 484 }
 485
 486 static bool
 487 fini_streams_for_device (struct ptx_device *ptx_dev)
 488 {
 489   free (ptx_dev->async_streams.arr);
 490
 491   bool ret = true;
 492   while (ptx_dev->active_streams != NULL)
 493     {
 494       struct ptx_stream *s = ptx_dev->active_streams;
 495       ptx_dev->active_streams = ptx_dev->active_streams->next;
 496
 497       ret &= map_fini (s);
 498
 499       CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
 500       if (r != CUDA_SUCCESS)
 501         {
 502           GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
 503           ret = false;
 504         }
 505       free (s);
 506     }
 507
 508   ret &= map_fini (ptx_dev->null_stream);
 509   free (ptx_dev->null_stream);
 510   return ret;
 511 }
 512
 513 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
 514    thread THREAD (and also current device/context).  If CREATE is true, create
 515    the stream if it does not exist (or use EXISTING if it is non-NULL), and
 516    associate the stream with the same thread argument.  Returns stream to use
 517    as result.  */
 518
 519 static struct ptx_stream *
 520 select_stream_for_async (int async, pthread_t thread, bool create,
 521                          CUstream existing)
 522 {
 523   struct nvptx_thread *nvthd = nvptx_thread ();
 524   /* Local copy of TLS variable.  */
 525   struct ptx_device *ptx_dev = nvthd->ptx_dev;
 526   struct ptx_stream *stream = NULL;
 527   int orig_async = async;
 528
 529   /* The special value acc_async_noval (-1) maps (for now) to an
 530      implicitly-created stream, which is then handled the same as any other
 531      numbered async stream.  Other options are available, e.g. using the null
 532      stream for anonymous async operations, or choosing an idle stream from an
 533      active set.  But, stick with this for now.  */
 534   if (async > acc_async_sync)
 535     async++;
 536
 537   if (create)
 538     pthread_mutex_lock (&ptx_dev->stream_lock);
 539
 540   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
 541      null stream, and in fact better performance may be obtainable if it doesn't
 542      (because the null stream enforces overly-strict synchronisation with
 543      respect to other streams for legacy reasons, and that's probably not
 544      needed with OpenACC).  Maybe investigate later.  */
 545   if (async == acc_async_sync)
 546     stream = ptx_dev->null_stream;
 547   else if (async >= 0 && async < ptx_dev->async_streams.size
 548            && ptx_dev->async_streams.arr[async] && !(create && existing))
 549     stream = ptx_dev->async_streams.arr[async];
 550   else if (async >= 0 && create)
 551     {
 552       if (async >= ptx_dev->async_streams.size)
 553         {
 554           int i, newsize = ptx_dev->async_streams.size * 2;
 555
 556           if (async >= newsize)
 557             newsize = async + 1;
 558
 559           ptx_dev->async_streams.arr
 560             = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
 561                                    newsize * sizeof (struct ptx_stream *));
 562
 563           for (i = ptx_dev->async_streams.size; i < newsize; i++)
 564             ptx_dev->async_streams.arr[i] = NULL;
 565
 566           ptx_dev->async_streams.size = newsize;
 567         }
 568
 569       /* Create a new stream on-demand if there isn't one already, or if we're
 570          setting a particular async value to an existing (externally-provided)
 571          stream.  */
 572       if (!ptx_dev->async_streams.arr[async] || existing)
 573         {
 574           CUresult r;
 575           struct ptx_stream *s
 576             = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 577
 578           if (existing)
 579             s->stream = existing;
 580           else
 581             {
 582               r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
 583                                      CU_STREAM_DEFAULT);
 584               if (r != CUDA_SUCCESS)
 585                 {
 586                   pthread_mutex_unlock (&ptx_dev->stream_lock);
 587                   GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
 588                                      cuda_error (r));
 589                 }
 590             }
 591
 592           /* If CREATE is true, we're going to be queueing some work on this
 593              stream.  Associate it with the current host thread.  */
 594           s->host_thread = thread;
 595           s->multithreaded = false;
 596
 597           s->d = (CUdeviceptr) NULL;
 598           s->h = NULL;
 599           if (!map_init (s))
 600             {
 601               pthread_mutex_unlock (&ptx_dev->stream_lock);
 602               GOMP_PLUGIN_fatal ("map_init fail");
 603             }
 604
 605           s->next = ptx_dev->active_streams;
 606           ptx_dev->active_streams = s;
 607           ptx_dev->async_streams.arr[async] = s;
 608         }
 609
 610       stream = ptx_dev->async_streams.arr[async];
 611     }
 612   else if (async < 0)
 613     {
 614       if (create)
 615         pthread_mutex_unlock (&ptx_dev->stream_lock);
 616       GOMP_PLUGIN_fatal ("bad async %d", async);
 617     }
 618
 619   if (create)
 620     {
 621       assert (stream != NULL);
 622
 623       /* If we're trying to use the same stream from different threads
 624          simultaneously, set stream->multithreaded to true.  This affects the
 625          behaviour of acc_async_test_all and acc_wait_all, which are supposed to
 626          only wait for asynchronous launches from the same host thread they are
 627          invoked on.  If multiple threads use the same async value, we make note
 628          of that here and fall back to testing/waiting for all threads in those
 629          functions.  */
 630       if (thread != stream->host_thread)
 631         stream->multithreaded = true;
 632
 633       pthread_mutex_unlock (&ptx_dev->stream_lock);
 634     }
 635   else if (stream && !stream->multithreaded
 636            && !pthread_equal (stream->host_thread, thread))
 637     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
 638
 639   return stream;
 640 }
 641
 642 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 643    should be locked on entry and remains locked on exit.  */
 644
 645 static bool
 646 nvptx_init (void)
 647 {
 648   int ndevs;
 649
 650   if (instantiated_devices != 0)
 651     return true;
 652
 653   ptx_events = NULL;
 654   pthread_mutex_init (&ptx_event_lock, NULL);
 655
 656   if (!init_cuda_lib ())
 657     return false;
 658
 659   CUDA_CALL (cuInit, 0);
 660
 661   CUDA_CALL (cuDeviceGetCount, &ndevs);
 662   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 663                                             * ndevs);
 664   return true;
 665 }
 666
 667 /* Select the N'th PTX device for the current host thread.  The device must
 668    have been previously opened before calling this function.  */
 669
 670 static bool
 671 nvptx_attach_host_thread_to_device (int n)
 672 {
 673   CUdevice dev;
 674   CUresult r;
 675   struct ptx_device *ptx_dev;
 676   CUcontext thd_ctx;
 677
 678   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
 679   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 680     {
 681       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 682       return false;
 683     }
 684
 685   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 686     return true;
 687   else
 688     {
 689       CUcontext old_ctx;
 690
 691       ptx_dev = ptx_devices[n];
 692       if (!ptx_dev)
 693         {
 694           GOMP_PLUGIN_error ("device %d not found", n);
 695           return false;
 696         }
 697
 698       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 699
 700       /* We don't necessarily have a current context (e.g. if it has been
 701          destroyed.  Pop it if we do though.  */
 702       if (thd_ctx != NULL)
 703         CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 704
 705       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
 706     }
 707   return true;
 708 }
 709
 710 static struct ptx_device *
 711 nvptx_open_device (int n)
 712 {
 713   struct ptx_device *ptx_dev;
 714   CUdevice dev, ctx_dev;
 715   CUresult r;
 716   int async_engines, pi;
 717
 718   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 719
 720   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 721
 722   ptx_dev->ord = n;
 723   ptx_dev->dev = dev;
 724   ptx_dev->ctx_shared = false;
 725
 726   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
 727   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 728     {
 729       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 730       return NULL;
 731     }
 732
 733   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 734     {
 735       /* The current host thread has an active context for a different device.
 736          Detach it.  */
 737       CUcontext old_ctx;
 738       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
 739     }
 740
 741   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 742
 743   if (!ptx_dev->ctx)
 744     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 745   else
 746     ptx_dev->ctx_shared = true;
 747
 748   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 749                   &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 750   ptx_dev->overlap = pi;
 751
 752   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 753                   &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 754   ptx_dev->map = pi;
 755
 756   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 757                   &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 758   ptx_dev->concur = pi;
 759
 760   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 761                   &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 762   ptx_dev->mode = pi;
 763
 764   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 765                   &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 766   ptx_dev->mkern = pi;
 767
 768   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 769                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 770   ptx_dev->clock_khz = pi;
 771
 772   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 773                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 774   ptx_dev->num_sms = pi;
 775
 776   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 777                   &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 778   ptx_dev->regs_per_block = pi;
 779
 780   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
 781      in CUDA 6.0 and newer.  */
 782   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
 783   /* Fallback: use limit of registers per block, which is usually equal.  */
 784   if (r == CUDA_ERROR_INVALID_VALUE)
 785     pi = ptx_dev->regs_per_block;
 786   else if (r != CUDA_SUCCESS)
 787     {
 788       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
 789       return NULL;
 790     }
 791   ptx_dev->regs_per_sm = pi;
 792
 793   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 794                   &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 795   if (pi != 32)
 796     {
 797       GOMP_PLUGIN_error ("Only warp size 32 is supported");
 798       return NULL;
 799     }
 800
 801   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 802                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 803   if (r != CUDA_SUCCESS)
 804     async_engines = 1;
 805
 806   ptx_dev->images = NULL;
 807   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 808
 809   if (!init_streams_for_device (ptx_dev, async_engines))
 810     return NULL;
 811
 812   return ptx_dev;
 813 }
 814
 815 static bool
 816 nvptx_close_device (struct ptx_device *ptx_dev)
 817 {
 818   if (!ptx_dev)
 819     return true;
 820
 821   if (!fini_streams_for_device (ptx_dev))
 822     return false;
 823
 824   pthread_mutex_destroy (&ptx_dev->image_lock);
 825
 826   if (!ptx_dev->ctx_shared)
 827     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 828
 829   free (ptx_dev);
 830   return true;
 831 }
 832
 833 static int
 834 nvptx_get_num_devices (void)
 835 {
 836   int n;
 837
 838   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
 839      configurations.  */
 840   if (sizeof (void *) != 8)
 841     {
 842       GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
 843                          " only 64-bit configurations are supported\n");
 844       return 0;
 845     }
 846
 847   /* This function will be called before the plugin has been initialized in
 848      order to enumerate available devices, but CUDA API routines can't be used
 849      until cuInit has been called.  Just call it now (but don't yet do any
 850      further initialization).  */
 851   if (instantiated_devices == 0)
 852     {
 853       if (!init_cuda_lib ())
 854         return 0;
 855       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
 856       /* This is not an error: e.g. we may have CUDA libraries installed but
 857          no devices available.  */
 858       if (r != CUDA_SUCCESS)
 859         {
 860           GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
 861                              cuda_error (r));
 862           return 0;
 863         }
 864     }
 865
 866   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
 867   return n;
 868 }
 869
 870 static void
 871 notify_var (const char *var_name, const char *env_var)
 872 {
 873   if (env_var == NULL)
 874     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
 875   else
 876     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 877 }
 878
 879 static bool
 880 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 881           unsigned num_objs)
 882 {
 883   CUjit_option opts[6];
 884   void *optvals[6];
 885   float elapsed = 0.0;
 886   char elog[1024];
 887   char ilog[16384];
 888   CUlinkState linkstate;
 889   CUresult r;
 890   void *linkout;
 891   size_t linkoutsize __attribute__ ((unused));
 892
 893   opts[0] = CU_JIT_WALL_TIME;
 894   optvals[0] = &elapsed;
 895
 896   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 897   optvals[1] = &ilog[0];
 898
 899   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 900   optvals[2] = (void *) sizeof ilog;
 901
 902   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 903   optvals[3] = &elog[0];
 904
 905   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 906   optvals[4] = (void *) sizeof elog;
 907
 908   opts[5] = CU_JIT_LOG_VERBOSE;
 909   optvals[5] = (void *) 1;
 910
 911   CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
 912
 913   for (; num_objs--; ptx_objs++)
 914     {
 915       /* cuLinkAddData's 'data' argument erroneously omits the const
 916          qualifier.  */
 917       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 918       r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
 919                              (char *) ptx_objs->code, ptx_objs->size,
 920                              0, 0, 0, 0);
 921       if (r != CUDA_SUCCESS)
 922         {
 923           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 924           GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
 925                              cuda_error (r));
 926           return false;
 927         }
 928     }
 929
 930   GOMP_PLUGIN_debug (0, "Linking\n");
 931   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 932
 933   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 934   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 935
 936   if (r != CUDA_SUCCESS)
 937     {
 938       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
 939       return false;
 940     }
 941
 942   CUDA_CALL (cuModuleLoadData, module, linkout);
 943   CUDA_CALL (cuLinkDestroy, linkstate);
 944   return true;
 945 }
 946
 947 static void
 948 event_gc (bool memmap_lockable)
 949 {
 950   struct ptx_event *ptx_event = ptx_events;
 951   struct ptx_event *async_cleanups = NULL;
 952   struct nvptx_thread *nvthd = nvptx_thread ();
 953
 954   pthread_mutex_lock (&ptx_event_lock);
 955
 956   while (ptx_event != NULL)
 957     {
 958       CUresult r;
 959       struct ptx_event *e = ptx_event;
 960
 961       ptx_event = ptx_event->next;
 962
 963       if (e->ord != nvthd->ptx_dev->ord)
 964         continue;
 965
 966       r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
 967       if (r == CUDA_SUCCESS)
 968         {
 969           bool append_async = false;
 970           CUevent *te;
 971
 972           te = e->evt;
 973
 974           switch (e->type)
 975             {
 976             case PTX_EVT_MEM:
 977             case PTX_EVT_SYNC:
 978               break;
 979
 980             case PTX_EVT_KNL:
 981               map_pop (e->addr);
 982               break;
 983
 984             case PTX_EVT_ASYNC_CLEANUP:
 985               {
 986                 /* The function gomp_plugin_async_unmap_vars needs to claim the
 987                    memory-map splay tree lock for the current device, so we
 988                    can't call it when one of our callers has already claimed
 989                    the lock.  In that case, just delay the GC for this event
 990                    until later.  */
 991                 if (!memmap_lockable)
 992                   continue;
 993
 994                 append_async = true;
 995               }
 996               break;
 997             }
 998
 999           CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1000           free ((void *)te);
1001
1002           /* Unlink 'e' from ptx_events list.  */
1003           if (ptx_events == e)
1004             ptx_events = ptx_events->next;
1005           else
1006             {
1007               struct ptx_event *e_ = ptx_events;
1008               while (e_->next != e)
1009                 e_ = e_->next;
1010               e_->next = e_->next->next;
1011             }
1012
1013           if (append_async)
1014             {
1015               e->next = async_cleanups;
1016               async_cleanups = e;
1017             }
1018           else
1019             free (e);
1020         }
1021     }
1022
1023   pthread_mutex_unlock (&ptx_event_lock);
1024
1025   /* We have to do these here, after ptx_event_lock is released.  */
1026   while (async_cleanups)
1027     {
1028       struct ptx_event *e = async_cleanups;
1029       async_cleanups = async_cleanups->next;
1030
1031       GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1032       free (e);
1033     }
1034 }
1035
1036 static void
1037 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1038 {
1039   struct ptx_event *ptx_event;
1040   struct nvptx_thread *nvthd = nvptx_thread ();
1041
1042   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1043           || type == PTX_EVT_ASYNC_CLEANUP);
1044
1045   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1046   ptx_event->type = type;
1047   ptx_event->evt = e;
1048   ptx_event->addr = h;
1049   ptx_event->ord = nvthd->ptx_dev->ord;
1050   ptx_event->val = val;
1051
1052   pthread_mutex_lock (&ptx_event_lock);
1053
1054   ptx_event->next = ptx_events;
1055   ptx_events = ptx_event;
1056
1057   pthread_mutex_unlock (&ptx_event_lock);
1058 }
1059
1060 static void
1061 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1062             int async, unsigned *dims, void *targ_mem_desc)
1063 {
1064   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1065   CUfunction function;
1066   CUresult r;
1067   int i;
1068   struct ptx_stream *dev_str;
1069   void *kargs[1];
1070   void *hp, *dp;
1071   struct nvptx_thread *nvthd = nvptx_thread ();
1072   const char *maybe_abort_msg = "(perhaps abort was called)";
1073
1074   function = targ_fn->fn;
1075
1076   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1077   assert (dev_str == nvthd->current_stream);
1078
1079   /* Initialize the launch dimensions.  Typically this is constant,
1080      provided by the device compiler, but we must permit runtime
1081      values.  */
1082   int seen_zero = 0;
1083   for (i = 0; i != GOMP_DIM_MAX; i++)
1084     {
1085       if (targ_fn->launch->dim[i])
1086        dims[i] = targ_fn->launch->dim[i];
1087       if (!dims[i])
1088        seen_zero = 1;
1089     }
1090
1091   if (seen_zero)
1092     {
1093       /* See if the user provided GOMP_OPENACC_DIM environment
1094          variable to specify runtime defaults. */
1095       static int default_dims[GOMP_DIM_MAX];
1096
1097       pthread_mutex_lock (&ptx_dev_lock);
1098       if (!default_dims[0])
1099         {
1100           const char *var_name = "GOMP_OPENACC_DIM";
1101           /* We only read the environment variable once.  You can't
1102              change it in the middle of execution.  The syntax  is
1103              the same as for the -fopenacc-dim compilation option.  */
1104           const char *env_var = getenv (var_name);
1105           notify_var (var_name, env_var);
1106           if (env_var)
1107             {
1108               const char *pos = env_var;
1109
1110               for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
1111                 {
1112                   if (i && *pos++ != ':')
1113                     break;
1114                   if (*pos != ':')
1115                     {
1116                       const char *eptr;
1117
1118                       errno = 0;
1119                       long val = strtol (pos, (char **)&eptr, 10);
1120                       if (errno || val < 0 || (unsigned)val != val)
1121                         break;
1122                       default_dims[i] = (int)val;
1123                       pos = eptr;
1124                     }
1125                 }
1126             }
1127
1128           int warp_size, block_size, dev_size, cpu_size;
1129           CUdevice dev = nvptx_thread()->ptx_dev->dev;
1130           /* 32 is the default for known hardware.  */
1131           int gang = 0, worker = 32, vector = 32;
1132           CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1133
1134           cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1135           cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1136           cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1137           cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1138
1139           if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1140                                  dev) == CUDA_SUCCESS
1141               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1142                                     dev) == CUDA_SUCCESS
1143               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1144                                     dev) == CUDA_SUCCESS
1145               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1146                                     dev) == CUDA_SUCCESS)
1147             {
1148               GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1149                                  " dev_size=%d, cpu_size=%d\n",
1150                                  warp_size, block_size, dev_size, cpu_size);
1151               gang = (cpu_size / block_size) * dev_size;
1152               worker = block_size / warp_size;
1153               vector = warp_size;
1154             }
1155
1156           /* There is no upper bound on the gang size.  The best size
1157              matches the hardware configuration.  Logical gangs are
1158              scheduled onto physical hardware.  To maximize usage, we
1159              should guess a large number.  */
1160           if (default_dims[GOMP_DIM_GANG] < 1)
1161             default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1162           /* The worker size must not exceed the hardware.  */
1163           if (default_dims[GOMP_DIM_WORKER] < 1
1164               || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1165             default_dims[GOMP_DIM_WORKER] = worker;
1166           /* The vector size must exactly match the hardware.  */
1167           if (default_dims[GOMP_DIM_VECTOR] < 1
1168               || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1169             default_dims[GOMP_DIM_VECTOR] = vector;
1170
1171           GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1172                              default_dims[GOMP_DIM_GANG],
1173                              default_dims[GOMP_DIM_WORKER],
1174                              default_dims[GOMP_DIM_VECTOR]);
1175         }
1176       pthread_mutex_unlock (&ptx_dev_lock);
1177
1178       for (i = 0; i != GOMP_DIM_MAX; i++)
1179         if (!dims[i])
1180           dims[i] = default_dims[i];
1181     }
1182
1183   /* This reserves a chunk of a pre-allocated page of memory mapped on both
1184      the host and the device. HP is a host pointer to the new chunk, and DP is
1185      the corresponding device pointer.  */
1186   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1187
1188   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1189
1190   /* Copy the array of arguments to the mapped page.  */
1191   for (i = 0; i < mapnum; i++)
1192     ((void **) hp)[i] = devaddrs[i];
1193
1194   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1195      fact have the same value on a unified-memory system).  */
1196   CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1197                     mapnum * sizeof (void *));
1198   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
1199                      " gangs=%u, workers=%u, vectors=%u\n",
1200                      __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1201                      dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1202
1203   // OpenACC            CUDA
1204   //
1205   // num_gangs          nctaid.x
1206   // num_workers        ntid.y
1207   // vector length      ntid.x
1208
1209   kargs[0] = &dp;
1210   CUDA_CALL_ASSERT (cuLaunchKernel, function,
1211                     dims[GOMP_DIM_GANG], 1, 1,
1212                     dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1213                     0, dev_str->stream, kargs, 0);
1214
1215 #ifndef DISABLE_ASYNC
1216   if (async < acc_async_noval)
1217     {
1218       r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1219       if (r == CUDA_ERROR_LAUNCH_FAILED)
1220         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1221                            maybe_abort_msg);
1222       else if (r != CUDA_SUCCESS)
1223         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1224     }
1225   else
1226     {
1227       CUevent *e;
1228
1229       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1230
1231       r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1232       if (r == CUDA_ERROR_LAUNCH_FAILED)
1233         GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1234                            maybe_abort_msg);
1235       else if (r != CUDA_SUCCESS)
1236         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1237
1238       event_gc (true);
1239
1240       CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1241
1242       event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1243     }
1244 #else
1245   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1246   if (r == CUDA_ERROR_LAUNCH_FAILED)
1247     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1248                        maybe_abort_msg);
1249   else if (r != CUDA_SUCCESS)
1250     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1251 #endif
1252
1253   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1254                      targ_fn->launch->fn);
1255
1256 #ifndef DISABLE_ASYNC
1257   if (async < acc_async_noval)
1258 #endif
1259     map_pop (dev_str);
1260 }
1261
1262 void * openacc_get_current_cuda_context (void);
1263
1264 static void *
1265 nvptx_alloc (size_t s)
1266 {
1267   CUdeviceptr d;
1268
1269   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1270   return (void *) d;
1271 }
1272
1273 static bool
1274 nvptx_free (void *p)
1275 {
1276   CUdeviceptr pb;
1277   size_t ps;
1278
1279   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1280   if ((CUdeviceptr) p != pb)
1281     {
1282       GOMP_PLUGIN_error ("invalid device address");
1283       return false;
1284     }
1285
1286   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1287   return true;
1288 }
1289
1290
1291 static bool
1292 nvptx_host2dev (void *d, const void *h, size_t s)
1293 {
1294   CUdeviceptr pb;
1295   size_t ps;
1296   struct nvptx_thread *nvthd = nvptx_thread ();
1297
1298   if (!s)
1299     return true;
1300   if (!d)
1301     {
1302       GOMP_PLUGIN_error ("invalid device address");
1303       return false;
1304     }
1305
1306   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1307
1308   if (!pb)
1309     {
1310       GOMP_PLUGIN_error ("invalid device address");
1311       return false;
1312     }
1313   if (!h)
1314     {
1315       GOMP_PLUGIN_error ("invalid host address");
1316       return false;
1317     }
1318   if (d == h)
1319     {
1320       GOMP_PLUGIN_error ("invalid host or device address");
1321       return false;
1322     }
1323   if ((void *)(d + s) > (void *)(pb + ps))
1324     {
1325       GOMP_PLUGIN_error ("invalid size");
1326       return false;
1327     }
1328
1329 #ifndef DISABLE_ASYNC
1330   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1331     {
1332       CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1333       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1334       event_gc (false);
1335       CUDA_CALL (cuMemcpyHtoDAsync,
1336                  (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1337       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1338       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1339     }
1340   else
1341 #endif
1342     CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1343
1344   return true;
1345 }
1346
1347 static bool
1348 nvptx_dev2host (void *h, const void *d, size_t s)
1349 {
1350   CUdeviceptr pb;
1351   size_t ps;
1352   struct nvptx_thread *nvthd = nvptx_thread ();
1353
1354   if (!s)
1355     return true;
1356   if (!d)
1357     {
1358       GOMP_PLUGIN_error ("invalid device address");
1359       return false;
1360     }
1361
1362   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1363
1364   if (!pb)
1365     {
1366       GOMP_PLUGIN_error ("invalid device address");
1367       return false;
1368     }
1369   if (!h)
1370     {
1371       GOMP_PLUGIN_error ("invalid host address");
1372       return false;
1373     }
1374   if (d == h)
1375     {
1376       GOMP_PLUGIN_error ("invalid host or device address");
1377       return false;
1378     }
1379   if ((void *)(d + s) > (void *)(pb + ps))
1380     {
1381       GOMP_PLUGIN_error ("invalid size");
1382       return false;
1383     }
1384
1385 #ifndef DISABLE_ASYNC
1386   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1387     {
1388       CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1389       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1390       event_gc (false);
1391       CUDA_CALL (cuMemcpyDtoHAsync,
1392                  h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1393       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1394       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1395     }
1396   else
1397 #endif
1398     CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1399
1400   return true;
1401 }
1402
1403 static void
1404 nvptx_set_async (int async)
1405 {
1406   struct nvptx_thread *nvthd = nvptx_thread ();
1407   nvthd->current_stream
1408     = select_stream_for_async (async, pthread_self (), true, NULL);
1409 }
1410
1411 static int
1412 nvptx_async_test (int async)
1413 {
1414   CUresult r;
1415   struct ptx_stream *s;
1416
1417   s = select_stream_for_async (async, pthread_self (), false, NULL);
1418
1419   if (!s)
1420     GOMP_PLUGIN_fatal ("unknown async %d", async);
1421
1422   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1423   if (r == CUDA_SUCCESS)
1424     {
1425       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1426          whether all work has completed on this stream, and if so omits the call
1427          to the wait hook.  If that happens, event_gc might not get called
1428          (which prevents variables from getting unmapped and their associated
1429          device storage freed), so call it here.  */
1430       event_gc (true);
1431       return 1;
1432     }
1433   else if (r == CUDA_ERROR_NOT_READY)
1434     return 0;
1435
1436   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1437
1438   return 0;
1439 }
1440
1441 static int
1442 nvptx_async_test_all (void)
1443 {
1444   struct ptx_stream *s;
1445   pthread_t self = pthread_self ();
1446   struct nvptx_thread *nvthd = nvptx_thread ();
1447
1448   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1449
1450   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1451     {
1452       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1453           && CUDA_CALL_NOCHECK (cuStreamQuery,
1454                                 s->stream) == CUDA_ERROR_NOT_READY)
1455         {
1456           pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1457           return 0;
1458         }
1459     }
1460
1461   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1462
1463   event_gc (true);
1464
1465   return 1;
1466 }
1467
1468 static void
1469 nvptx_wait (int async)
1470 {
1471   struct ptx_stream *s;
1472
1473   s = select_stream_for_async (async, pthread_self (), false, NULL);
1474   if (!s)
1475     GOMP_PLUGIN_fatal ("unknown async %d", async);
1476
1477   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1478
1479   event_gc (true);
1480 }
1481
1482 static void
1483 nvptx_wait_async (int async1, int async2)
1484 {
1485   CUevent *e;
1486   struct ptx_stream *s1, *s2;
1487   pthread_t self = pthread_self ();
1488
1489   /* The stream that is waiting (rather than being waited for) doesn't
1490      necessarily have to exist already.  */
1491   s2 = select_stream_for_async (async2, self, true, NULL);
1492
1493   s1 = select_stream_for_async (async1, self, false, NULL);
1494   if (!s1)
1495     GOMP_PLUGIN_fatal ("invalid async 1\n");
1496
1497   if (s1 == s2)
1498     GOMP_PLUGIN_fatal ("identical parameters");
1499
1500   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1501
1502   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1503
1504   event_gc (true);
1505
1506   CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1507
1508   event_add (PTX_EVT_SYNC, e, NULL, 0);
1509
1510   CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1511 }
1512
1513 static void
1514 nvptx_wait_all (void)
1515 {
1516   CUresult r;
1517   struct ptx_stream *s;
1518   pthread_t self = pthread_self ();
1519   struct nvptx_thread *nvthd = nvptx_thread ();
1520
1521   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1522
1523   /* Wait for active streams initiated by this thread (or by multiple threads)
1524      to complete.  */
1525   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1526     {
1527       if (s->multithreaded || pthread_equal (s->host_thread, self))
1528         {
1529           r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1530           if (r == CUDA_SUCCESS)
1531             continue;
1532           else if (r != CUDA_ERROR_NOT_READY)
1533             GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1534
1535           CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1536         }
1537     }
1538
1539   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1540
1541   event_gc (true);
1542 }
1543
1544 static void
1545 nvptx_wait_all_async (int async)
1546 {
1547   struct ptx_stream *waiting_stream, *other_stream;
1548   CUevent *e;
1549   struct nvptx_thread *nvthd = nvptx_thread ();
1550   pthread_t self = pthread_self ();
1551
1552   /* The stream doing the waiting.  This could be the first mention of the
1553      stream, so create it if necessary.  */
1554   waiting_stream
1555     = select_stream_for_async (async, pthread_self (), true, NULL);
1556
1557   /* Launches on the null stream already block on other streams in the
1558      context.  */
1559   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1560     return;
1561
1562   event_gc (true);
1563
1564   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1565
1566   for (other_stream = nvthd->ptx_dev->active_streams;
1567        other_stream != NULL;
1568        other_stream = other_stream->next)
1569     {
1570       if (!other_stream->multithreaded
1571           && !pthread_equal (other_stream->host_thread, self))
1572         continue;
1573
1574       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1575
1576       CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1577
1578       /* Record an event on the waited-for stream.  */
1579       CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1580
1581       event_add (PTX_EVT_SYNC, e, NULL, 0);
1582
1583       CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1584    }
1585
1586   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1587 }
1588
1589 static void *
1590 nvptx_get_current_cuda_device (void)
1591 {
1592   struct nvptx_thread *nvthd = nvptx_thread ();
1593
1594   if (!nvthd || !nvthd->ptx_dev)
1595     return NULL;
1596
1597   return &nvthd->ptx_dev->dev;
1598 }
1599
1600 static void *
1601 nvptx_get_current_cuda_context (void)
1602 {
1603   struct nvptx_thread *nvthd = nvptx_thread ();
1604
1605   if (!nvthd || !nvthd->ptx_dev)
1606     return NULL;
1607
1608   return nvthd->ptx_dev->ctx;
1609 }
1610
1611 static void *
1612 nvptx_get_cuda_stream (int async)
1613 {
1614   struct ptx_stream *s;
1615   struct nvptx_thread *nvthd = nvptx_thread ();
1616
1617   if (!nvthd || !nvthd->ptx_dev)
1618     return NULL;
1619
1620   s = select_stream_for_async (async, pthread_self (), false, NULL);
1621
1622   return s ? s->stream : NULL;
1623 }
1624
1625 static int
1626 nvptx_set_cuda_stream (int async, void *stream)
1627 {
1628   struct ptx_stream *oldstream;
1629   pthread_t self = pthread_self ();
1630   struct nvptx_thread *nvthd = nvptx_thread ();
1631
1632   if (async < 0)
1633     GOMP_PLUGIN_fatal ("bad async %d", async);
1634
1635   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1636
1637   /* We have a list of active streams and an array mapping async values to
1638      entries of that list.  We need to take "ownership" of the passed-in stream,
1639      and add it to our list, removing the previous entry also (if there was one)
1640      in order to prevent resource leaks.  Note the potential for surprise
1641      here: maybe we should keep track of passed-in streams and leave it up to
1642      the user to tidy those up, but that doesn't work for stream handles
1643      returned from acc_get_cuda_stream above...  */
1644
1645   oldstream = select_stream_for_async (async, self, false, NULL);
1646
1647   if (oldstream)
1648     {
1649       if (nvthd->ptx_dev->active_streams == oldstream)
1650         nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1651       else
1652         {
1653           struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1654           while (s->next != oldstream)
1655             s = s->next;
1656           s->next = s->next->next;
1657         }
1658
1659       CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1660
1661       if (!map_fini (oldstream))
1662         GOMP_PLUGIN_fatal ("error when freeing host memory");
1663
1664       free (oldstream);
1665     }
1666
1667   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1668
1669   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1670
1671   return 1;
1672 }
1673
1674 /* Plugin entry points.  */
1675
1676 const char *
1677 GOMP_OFFLOAD_get_name (void)
1678 {
1679   return "nvptx";
1680 }
1681
1682 unsigned int
1683 GOMP_OFFLOAD_get_caps (void)
1684 {
1685   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1686 }
1687
1688 int
1689 GOMP_OFFLOAD_get_type (void)
1690 {
1691   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1692 }
1693
1694 int
1695 GOMP_OFFLOAD_get_num_devices (void)
1696 {
1697   return nvptx_get_num_devices ();
1698 }
1699
1700 bool
1701 GOMP_OFFLOAD_init_device (int n)
1702 {
1703   struct ptx_device *dev;
1704
1705   pthread_mutex_lock (&ptx_dev_lock);
1706
1707   if (!nvptx_init () || ptx_devices[n] != NULL)
1708     {
1709       pthread_mutex_unlock (&ptx_dev_lock);
1710       return false;
1711     }
1712
1713   dev = nvptx_open_device (n);
1714   if (dev)
1715     {
1716       ptx_devices[n] = dev;
1717       instantiated_devices++;
1718     }
1719
1720   pthread_mutex_unlock (&ptx_dev_lock);
1721
1722   return dev != NULL;
1723 }
1724
1725 bool
1726 GOMP_OFFLOAD_fini_device (int n)
1727 {
1728   pthread_mutex_lock (&ptx_dev_lock);
1729
1730   if (ptx_devices[n] != NULL)
1731     {
1732       if (!nvptx_attach_host_thread_to_device (n)
1733           || !nvptx_close_device (ptx_devices[n]))
1734         {
1735           pthread_mutex_unlock (&ptx_dev_lock);
1736           return false;
1737         }
1738       ptx_devices[n] = NULL;
1739       instantiated_devices--;
1740     }
1741
1742   pthread_mutex_unlock (&ptx_dev_lock);
1743   return true;
1744 }
1745
1746 /* Return the libgomp version number we're compatible with.  There is
1747    no requirement for cross-version compatibility.  */
1748
1749 unsigned
1750 GOMP_OFFLOAD_version (void)
1751 {
1752   return GOMP_VERSION;
1753 }
1754
1755 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1756
1757 static void
1758 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1759 {
1760   CUdeviceptr dptr;
1761   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1762                                   module, "__nvptx_clocktick");
1763   if (r == CUDA_ERROR_NOT_FOUND)
1764     return;
1765   if (r != CUDA_SUCCESS)
1766     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1767   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1768   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1769                          sizeof (__nvptx_clocktick));
1770   if (r != CUDA_SUCCESS)
1771     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1772 }
1773
1774 /* Load the (partial) program described by TARGET_DATA to device
1775    number ORD.  Allocate and return TARGET_TABLE.  */
1776
1777 int
1778 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1779                          struct addr_pair **target_table)
1780 {
1781   CUmodule module;
1782   const char *const *var_names;
1783   const struct targ_fn_launch *fn_descs;
1784   unsigned int fn_entries, var_entries, i, j;
1785   struct targ_fn_descriptor *targ_fns;
1786   struct addr_pair *targ_tbl;
1787   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1788   struct ptx_image_data *new_image;
1789   struct ptx_device *dev;
1790
1791   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1792     {
1793       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1794                          " (expected %u, received %u)",
1795                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1796       return -1;
1797     }
1798
1799   if (!nvptx_attach_host_thread_to_device (ord)
1800       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1801     return -1;
1802
1803   dev = ptx_devices[ord];
1804
1805   /* The mkoffload utility emits a struct of pointers/integers at the
1806      start of each offload image.  The array of kernel names and the
1807      functions addresses form a one-to-one correspondence.  */
1808
1809   var_entries = img_header->var_num;
1810   var_names = img_header->var_names;
1811   fn_entries = img_header->fn_num;
1812   fn_descs = img_header->fn_descs;
1813
1814   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1815                                  * (fn_entries + var_entries));
1816   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1817                                  * fn_entries);
1818
1819   *target_table = targ_tbl;
1820
1821   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1822   new_image->target_data = target_data;
1823   new_image->module = module;
1824   new_image->fns = targ_fns;
1825
1826   pthread_mutex_lock (&dev->image_lock);
1827   new_image->next = dev->images;
1828   dev->images = new_image;
1829   pthread_mutex_unlock (&dev->image_lock);
1830
1831   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1832     {
1833       CUfunction function;
1834       int nregs, mthrs;
1835
1836       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1837                       fn_descs[i].fn);
1838       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1839                       CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1840       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1841                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1842
1843       targ_fns->fn = function;
1844       targ_fns->launch = &fn_descs[i];
1845       targ_fns->regs_per_thread = nregs;
1846       targ_fns->max_threads_per_block = mthrs;
1847
1848       targ_tbl->start = (uintptr_t) targ_fns;
1849       targ_tbl->end = targ_tbl->start + 1;
1850     }
1851
1852   for (j = 0; j < var_entries; j++, targ_tbl++)
1853     {
1854       CUdeviceptr var;
1855       size_t bytes;
1856
1857       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1858                       &var, &bytes, module, var_names[j]);
1859
1860       targ_tbl->start = (uintptr_t) var;
1861       targ_tbl->end = targ_tbl->start + bytes;
1862     }
1863
1864   nvptx_set_clocktick (module, dev);
1865
1866   return fn_entries + var_entries;
1867 }
1868
1869 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1870    function descriptors allocated by G_O_load_image.  */
1871
1872 bool
1873 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1874 {
1875   struct ptx_image_data *image, **prev_p;
1876   struct ptx_device *dev = ptx_devices[ord];
1877
1878   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1879     {
1880       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1881                          " (expected %u, received %u)",
1882                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1883       return false;
1884     }
1885
1886   bool ret = true;
1887   pthread_mutex_lock (&dev->image_lock);
1888   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1889     if (image->target_data == target_data)
1890       {
1891         *prev_p = image->next;
1892         if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1893           ret = false;
1894         free (image->fns);
1895         free (image);
1896         break;
1897       }
1898   pthread_mutex_unlock (&dev->image_lock);
1899   return ret;
1900 }
1901
1902 void *
1903 GOMP_OFFLOAD_alloc (int ord, size_t size)
1904 {
1905   if (!nvptx_attach_host_thread_to_device (ord))
1906     return NULL;
1907   return nvptx_alloc (size);
1908 }
1909
1910 bool
1911 GOMP_OFFLOAD_free (int ord, void *ptr)
1912 {
1913   return (nvptx_attach_host_thread_to_device (ord)
1914           && nvptx_free (ptr));
1915 }
1916
1917 bool
1918 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1919 {
1920   return (nvptx_attach_host_thread_to_device (ord)
1921           && nvptx_dev2host (dst, src, n));
1922 }
1923
1924 bool
1925 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1926 {
1927   return (nvptx_attach_host_thread_to_device (ord)
1928           && nvptx_host2dev (dst, src, n));
1929 }
1930
1931 bool
1932 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1933 {
1934   struct ptx_device *ptx_dev = ptx_devices[ord];
1935   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1936                                 ptx_dev->null_stream->stream);
1937   return true;
1938 }
1939
1940 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1941
1942 void
1943 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1944                            void **hostaddrs, void **devaddrs,
1945                            int async, unsigned *dims, void *targ_mem_desc)
1946 {
1947   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1948 }
1949
1950 void
1951 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1952 {
1953   struct nvptx_thread *nvthd = nvptx_thread ();
1954   CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1955
1956   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1957   CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1958   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1959 }
1960
1961 int
1962 GOMP_OFFLOAD_openacc_async_test (int async)
1963 {
1964   return nvptx_async_test (async);
1965 }
1966
1967 int
1968 GOMP_OFFLOAD_openacc_async_test_all (void)
1969 {
1970   return nvptx_async_test_all ();
1971 }
1972
1973 void
1974 GOMP_OFFLOAD_openacc_async_wait (int async)
1975 {
1976   nvptx_wait (async);
1977 }
1978
1979 void
1980 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1981 {
1982   nvptx_wait_async (async1, async2);
1983 }
1984
1985 void
1986 GOMP_OFFLOAD_openacc_async_wait_all (void)
1987 {
1988   nvptx_wait_all ();
1989 }
1990
1991 void
1992 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1993 {
1994   nvptx_wait_all_async (async);
1995 }
1996
1997 void
1998 GOMP_OFFLOAD_openacc_async_set_async (int async)
1999 {
2000   nvptx_set_async (async);
2001 }
2002
2003 void *
2004 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2005 {
2006   struct ptx_device *ptx_dev;
2007   struct nvptx_thread *nvthd
2008     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2009   CUcontext thd_ctx;
2010
2011   ptx_dev = ptx_devices[ord];
2012
2013   assert (ptx_dev);
2014
2015   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2016
2017   assert (ptx_dev->ctx);
2018
2019   if (!thd_ctx)
2020     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2021
2022   nvthd->current_stream = ptx_dev->null_stream;
2023   nvthd->ptx_dev = ptx_dev;
2024
2025   return (void *) nvthd;
2026 }
2027
2028 void
2029 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2030 {
2031   free (data);
2032 }
2033
2034 void *
2035 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2036 {
2037   return nvptx_get_current_cuda_device ();
2038 }
2039
2040 void *
2041 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2042 {
2043   return nvptx_get_current_cuda_context ();
2044 }
2045
2046 /* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
2047
2048 void *
2049 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2050 {
2051   return nvptx_get_cuda_stream (async);
2052 }
2053
2054 /* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
2055
2056 int
2057 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2058 {
2059   return nvptx_set_cuda_stream (async, stream);
2060 }
2061
2062 /* Adjust launch dimensions: pick good values for number of blocks and warps
2063    and ensure that number of warps does not exceed CUDA limits as well as GCC's
2064    own limits.  */
2065
2066 static void
2067 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2068                             struct ptx_device *ptx_dev,
2069                             int *teams_p, int *threads_p)
2070 {
2071   int max_warps_block = fn->max_threads_per_block / 32;
2072   /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2073      and libgcc, which matches documented limit of all GPUs as of 2015.  */
2074   if (max_warps_block > 32)
2075     max_warps_block = 32;
2076   if (*threads_p <= 0)
2077     *threads_p = 8;
2078   if (*threads_p > max_warps_block)
2079     *threads_p = max_warps_block;
2080
2081   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2082   /* This is an estimate of how many blocks the device can host simultaneously.
2083      Actual limit, which may be lower, can be queried with "occupancy control"
2084      driver interface (since CUDA 6.0).  */
2085   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2086   if (*teams_p <= 0 || *teams_p > max_blocks)
2087     *teams_p = max_blocks;
2088 }
2089
2090 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2091    target regions.  */
2092
2093 static size_t
2094 nvptx_stacks_size ()
2095 {
2096   return 128 * 1024;
2097 }
2098
2099 /* Return contiguous storage for NUM stacks, each SIZE bytes.  */
2100
2101 static void *
2102 nvptx_stacks_alloc (size_t size, int num)
2103 {
2104   CUdeviceptr stacks;
2105   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2106   if (r != CUDA_SUCCESS)
2107     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2108   return (void *) stacks;
2109 }
2110
2111 /* Release storage previously allocated by nvptx_stacks_alloc.  */
2112
2113 static void
2114 nvptx_stacks_free (void *p, int num)
2115 {
2116   CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2117   if (r != CUDA_SUCCESS)
2118     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2119 }
2120
2121 void
2122 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2123 {
2124   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2125   CUresult r;
2126   struct ptx_device *ptx_dev = ptx_devices[ord];
2127   const char *maybe_abort_msg = "(perhaps abort was called)";
2128   int teams = 0, threads = 0;
2129
2130   if (!args)
2131     GOMP_PLUGIN_fatal ("No target arguments provided");
2132   while (*args)
2133     {
2134       intptr_t id = (intptr_t) *args++, val;
2135       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2136         val = (intptr_t) *args++;
2137       else
2138         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2139       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2140         continue;
2141       val = val > INT_MAX ? INT_MAX : val;
2142       id &= GOMP_TARGET_ARG_ID_MASK;
2143       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2144         teams = val;
2145       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2146         threads = val;
2147     }
2148   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2149
2150   size_t stack_size = nvptx_stacks_size ();
2151   void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2152   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2153   size_t fn_args_size = sizeof fn_args;
2154   void *config[] = {
2155     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2156     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2157     CU_LAUNCH_PARAM_END
2158   };
2159   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2160                          32, threads, 1, 0, ptx_dev->null_stream->stream,
2161                          NULL, config);
2162   if (r != CUDA_SUCCESS)
2163     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2164
2165   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2166   if (r == CUDA_ERROR_LAUNCH_FAILED)
2167     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2168                        maybe_abort_msg);
2169   else if (r != CUDA_SUCCESS)
2170     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2171   nvptx_stacks_free (stacks, teams * threads);
2172 }
2173
2174 void
2175 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2176                         void *async_data)
2177 {
2178   GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
2179 }