libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2018 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #define _GNU_SOURCE
  35 #include "openacc.h"
  36 #include "config.h"
  37 #include "libgomp-plugin.h"
  38 #include "oacc-plugin.h"
  39 #include "gomp-constants.h"
  40
  41 #include <pthread.h>
  42 #include <cuda.h>
  43 #include <stdbool.h>
  44 #include <stdint.h>
  45 #include <limits.h>
  46 #include <string.h>
  47 #include <stdio.h>
  48 #include <unistd.h>
  49 #include <assert.h>
  50 #include <errno.h>
  51
  52 #if PLUGIN_NVPTX_DYNAMIC
  53 # include <dlfcn.h>
  54
  55 # define CUDA_CALLS \
  56 CUDA_ONE_CALL (cuCtxCreate)             \
  57 CUDA_ONE_CALL (cuCtxDestroy)            \
  58 CUDA_ONE_CALL (cuCtxGetCurrent)         \
  59 CUDA_ONE_CALL (cuCtxGetDevice)          \
  60 CUDA_ONE_CALL (cuCtxPopCurrent)         \
  61 CUDA_ONE_CALL (cuCtxPushCurrent)        \
  62 CUDA_ONE_CALL (cuCtxSynchronize)        \
  63 CUDA_ONE_CALL (cuDeviceGet)             \
  64 CUDA_ONE_CALL (cuDeviceGetAttribute)    \
  65 CUDA_ONE_CALL (cuDeviceGetCount)        \
  66 CUDA_ONE_CALL (cuEventCreate)           \
  67 CUDA_ONE_CALL (cuEventDestroy)          \
  68 CUDA_ONE_CALL (cuEventElapsedTime)      \
  69 CUDA_ONE_CALL (cuEventQuery)            \
  70 CUDA_ONE_CALL (cuEventRecord)           \
  71 CUDA_ONE_CALL (cuEventSynchronize)      \
  72 CUDA_ONE_CALL (cuFuncGetAttribute)      \
  73 CUDA_ONE_CALL (cuGetErrorString)        \
  74 CUDA_ONE_CALL (cuInit)                  \
  75 CUDA_ONE_CALL (cuLaunchKernel)          \
  76 CUDA_ONE_CALL (cuLinkAddData)           \
  77 CUDA_ONE_CALL (cuLinkComplete)          \
  78 CUDA_ONE_CALL (cuLinkCreate)            \
  79 CUDA_ONE_CALL (cuLinkDestroy)           \
  80 CUDA_ONE_CALL (cuMemAlloc)              \
  81 CUDA_ONE_CALL (cuMemAllocHost)          \
  82 CUDA_ONE_CALL (cuMemcpy)                \
  83 CUDA_ONE_CALL (cuMemcpyDtoDAsync)       \
  84 CUDA_ONE_CALL (cuMemcpyDtoH)            \
  85 CUDA_ONE_CALL (cuMemcpyDtoHAsync)       \
  86 CUDA_ONE_CALL (cuMemcpyHtoD)            \
  87 CUDA_ONE_CALL (cuMemcpyHtoDAsync)       \
  88 CUDA_ONE_CALL (cuMemFree)               \
  89 CUDA_ONE_CALL (cuMemFreeHost)           \
  90 CUDA_ONE_CALL (cuMemGetAddressRange)    \
  91 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
  92 CUDA_ONE_CALL (cuModuleGetFunction)     \
  93 CUDA_ONE_CALL (cuModuleGetGlobal)       \
  94 CUDA_ONE_CALL (cuModuleLoad)            \
  95 CUDA_ONE_CALL (cuModuleLoadData)        \
  96 CUDA_ONE_CALL (cuModuleUnload)          \
  97 CUDA_ONE_CALL (cuStreamCreate)          \
  98 CUDA_ONE_CALL (cuStreamDestroy)         \
  99 CUDA_ONE_CALL (cuStreamQuery)           \
 100 CUDA_ONE_CALL (cuStreamSynchronize)     \
 101 CUDA_ONE_CALL (cuStreamWaitEvent)
 102 # define CUDA_ONE_CALL(call) \
 103   __typeof (call) *call;
 104 struct cuda_lib_s {
 105   CUDA_CALLS
 106 } cuda_lib;
 107
 108 /* -1 if init_cuda_lib has not been called yet, false
 109    if it has been and failed, true if it has been and succeeded.  */
 110 static signed char cuda_lib_inited = -1;
 111
 112 /* Dynamically load the CUDA runtime library and initialize function
 113    pointers, return false if unsuccessful, true if successful.  */
 114 static bool
 115 init_cuda_lib (void)
 116 {
 117   if (cuda_lib_inited != -1)
 118     return cuda_lib_inited;
 119   const char *cuda_runtime_lib = "libcuda.so.1";
 120   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
 121   cuda_lib_inited = false;
 122   if (h == NULL)
 123     return false;
 124 # undef CUDA_ONE_CALL
 125 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
 126 # define CUDA_ONE_CALL_1(call) \
 127   cuda_lib.call = dlsym (h, #call);     \
 128   if (cuda_lib.call == NULL)            \
 129     return false;
 130   CUDA_CALLS
 131   cuda_lib_inited = true;
 132   return true;
 133 }
 134 # undef CUDA_ONE_CALL
 135 # undef CUDA_ONE_CALL_1
 136 # define CUDA_CALL_PREFIX cuda_lib.
 137 #else
 138 # define CUDA_CALL_PREFIX
 139 # define init_cuda_lib() true
 140 #endif
 141
 142 #include "secure_getenv.h"
 143
 144 /* Convenience macros for the frequently used CUDA library call and
 145    error handling sequence as well as CUDA library calls that
 146    do the error checking themselves or don't do it at all.  */
 147
 148 #define CUDA_CALL_ERET(ERET, FN, ...)           \
 149   do {                                          \
 150     unsigned __r                                \
 151       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 152     if (__r != CUDA_SUCCESS)                    \
 153       {                                         \
 154         GOMP_PLUGIN_error (#FN " error: %s",    \
 155                            cuda_error (__r));   \
 156         return ERET;                            \
 157       }                                         \
 158   } while (0)
 159
 160 #define CUDA_CALL(FN, ...)                      \
 161   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 162
 163 #define CUDA_CALL_ASSERT(FN, ...)               \
 164   do {                                          \
 165     unsigned __r                                \
 166       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 167     if (__r != CUDA_SUCCESS)                    \
 168       {                                         \
 169         GOMP_PLUGIN_fatal (#FN " error: %s",    \
 170                            cuda_error (__r));   \
 171       }                                         \
 172   } while (0)
 173
 174 #define CUDA_CALL_NOCHECK(FN, ...)              \
 175   CUDA_CALL_PREFIX FN (__VA_ARGS__)
 176
 177 static const char *
 178 cuda_error (CUresult r)
 179 {
 180 #if CUDA_VERSION < 7000
 181   /* Specified in documentation and present in library from at least
 182      5.5.  Not declared in header file prior to 7.0.  */
 183   extern CUresult cuGetErrorString (CUresult, const char **);
 184 #endif
 185   const char *desc;
 186
 187   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
 188   if (r != CUDA_SUCCESS)
 189     desc = "unknown cuda error";
 190
 191   return desc;
 192 }
 193
 194 static unsigned int instantiated_devices = 0;
 195 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 196
 197 struct ptx_stream
 198 {
 199   CUstream stream;
 200   pthread_t host_thread;
 201   bool multithreaded;
 202
 203   CUdeviceptr d;
 204   void *h;
 205   void *h_begin;
 206   void *h_end;
 207   void *h_next;
 208   void *h_prev;
 209   void *h_tail;
 210
 211   struct ptx_stream *next;
 212 };
 213
 214 /* Thread-specific data for PTX.  */
 215
 216 struct nvptx_thread
 217 {
 218   struct ptx_stream *current_stream;
 219   struct ptx_device *ptx_dev;
 220 };
 221
 222 struct map
 223 {
 224   int     async;
 225   size_t  size;
 226   char    mappings[0];
 227 };
 228
 229 static bool
 230 map_init (struct ptx_stream *s)
 231 {
 232   int size = getpagesize ();
 233
 234   assert (s);
 235   assert (!s->d);
 236   assert (!s->h);
 237
 238   CUDA_CALL (cuMemAllocHost, &s->h, size);
 239   CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 240
 241   assert (s->h);
 242
 243   s->h_begin = s->h;
 244   s->h_end = s->h_begin + size;
 245   s->h_next = s->h_prev = s->h_tail = s->h_begin;
 246
 247   assert (s->h_next);
 248   assert (s->h_end);
 249   return true;
 250 }
 251
 252 static bool
 253 map_fini (struct ptx_stream *s)
 254 {
 255   CUDA_CALL (cuMemFreeHost, s->h);
 256   return true;
 257 }
 258
 259 static void
 260 map_pop (struct ptx_stream *s)
 261 {
 262   struct map *m;
 263
 264   assert (s != NULL);
 265   assert (s->h_next);
 266   assert (s->h_prev);
 267   assert (s->h_tail);
 268
 269   m = s->h_tail;
 270
 271   s->h_tail += m->size;
 272
 273   if (s->h_tail >= s->h_end)
 274     s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
 275
 276   if (s->h_next == s->h_tail)
 277     s->h_prev = s->h_next;
 278
 279   assert (s->h_next >= s->h_begin);
 280   assert (s->h_tail >= s->h_begin);
 281   assert (s->h_prev >= s->h_begin);
 282
 283   assert (s->h_next <= s->h_end);
 284   assert (s->h_tail <= s->h_end);
 285   assert (s->h_prev <= s->h_end);
 286 }
 287
 288 static void
 289 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
 290 {
 291   int left;
 292   int offset;
 293   struct map *m;
 294
 295   assert (s != NULL);
 296
 297   left = s->h_end - s->h_next;
 298   size += sizeof (struct map);
 299
 300   assert (s->h_prev);
 301   assert (s->h_next);
 302
 303   if (size >= left)
 304     {
 305       m = s->h_prev;
 306       m->size += left;
 307       s->h_next = s->h_begin;
 308
 309       if (s->h_next + size > s->h_end)
 310         GOMP_PLUGIN_fatal ("unable to push map");
 311     }
 312
 313   assert (s->h_next);
 314
 315   m = s->h_next;
 316   m->async = async;
 317   m->size = size;
 318
 319   offset = (void *)&m->mappings[0] - s->h;
 320
 321   *d = (void *)(s->d + offset);
 322   *h = (void *)(s->h + offset);
 323
 324   s->h_prev = s->h_next;
 325   s->h_next += size;
 326
 327   assert (s->h_prev);
 328   assert (s->h_next);
 329
 330   assert (s->h_next >= s->h_begin);
 331   assert (s->h_tail >= s->h_begin);
 332   assert (s->h_prev >= s->h_begin);
 333   assert (s->h_next <= s->h_end);
 334   assert (s->h_tail <= s->h_end);
 335   assert (s->h_prev <= s->h_end);
 336
 337   return;
 338 }
 339
 340 /* Target data function launch information.  */
 341
 342 struct targ_fn_launch
 343 {
 344   const char *fn;
 345   unsigned short dim[GOMP_DIM_MAX];
 346 };
 347
 348 /* Target PTX object information.  */
 349
 350 struct targ_ptx_obj
 351 {
 352   const char *code;
 353   size_t size;
 354 };
 355
 356 /* Target data image information.  */
 357
 358 typedef struct nvptx_tdata
 359 {
 360   const struct targ_ptx_obj *ptx_objs;
 361   unsigned ptx_num;
 362
 363   const char *const *var_names;
 364   unsigned var_num;
 365
 366   const struct targ_fn_launch *fn_descs;
 367   unsigned fn_num;
 368 } nvptx_tdata_t;
 369
 370 /* Descriptor of a loaded function.  */
 371
 372 struct targ_fn_descriptor
 373 {
 374   CUfunction fn;
 375   const struct targ_fn_launch *launch;
 376   int regs_per_thread;
 377   int max_threads_per_block;
 378 };
 379
 380 /* A loaded PTX image.  */
 381 struct ptx_image_data
 382 {
 383   const void *target_data;
 384   CUmodule module;
 385
 386   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 387
 388   struct ptx_image_data *next;
 389 };
 390
 391 struct ptx_device
 392 {
 393   CUcontext ctx;
 394   bool ctx_shared;
 395   CUdevice dev;
 396   struct ptx_stream *null_stream;
 397   /* All non-null streams associated with this device (actually context),
 398      either created implicitly or passed in from the user (via
 399      acc_set_cuda_stream).  */
 400   struct ptx_stream *active_streams;
 401   struct {
 402     struct ptx_stream **arr;
 403     int size;
 404   } async_streams;
 405   /* A lock for use when manipulating the above stream list and array.  */
 406   pthread_mutex_t stream_lock;
 407   int ord;
 408   bool overlap;
 409   bool map;
 410   bool concur;
 411   bool mkern;
 412   int  mode;
 413   int clock_khz;
 414   int num_sms;
 415   int regs_per_block;
 416   int regs_per_sm;
 417
 418   struct ptx_image_data *images;  /* Images loaded on device.  */
 419   pthread_mutex_t image_lock;     /* Lock for above list.  */
 420
 421   struct ptx_device *next;
 422 };
 423
 424 enum ptx_event_type
 425 {
 426   PTX_EVT_MEM,
 427   PTX_EVT_KNL,
 428   PTX_EVT_SYNC,
 429   PTX_EVT_ASYNC_CLEANUP
 430 };
 431
 432 struct ptx_event
 433 {
 434   CUevent *evt;
 435   int type;
 436   void *addr;
 437   int ord;
 438   int val;
 439
 440   struct ptx_event *next;
 441 };
 442
 443 static pthread_mutex_t ptx_event_lock;
 444 static struct ptx_event *ptx_events;
 445
 446 static struct ptx_device **ptx_devices;
 447
 448 static inline struct nvptx_thread *
 449 nvptx_thread (void)
 450 {
 451   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 452 }
 453
 454 static bool
 455 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 456 {
 457   int i;
 458   struct ptx_stream *null_stream
 459     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 460
 461   null_stream->stream = NULL;
 462   null_stream->host_thread = pthread_self ();
 463   null_stream->multithreaded = true;
 464   null_stream->d = (CUdeviceptr) NULL;
 465   null_stream->h = NULL;
 466   if (!map_init (null_stream))
 467     return false;
 468
 469   ptx_dev->null_stream = null_stream;
 470   ptx_dev->active_streams = NULL;
 471   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 472
 473   if (concurrency < 1)
 474     concurrency = 1;
 475
 476   /* This is just a guess -- make space for as many async streams as the
 477      current device is capable of concurrently executing.  This can grow
 478      later as necessary.  No streams are created yet.  */
 479   ptx_dev->async_streams.arr
 480     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
 481   ptx_dev->async_streams.size = concurrency;
 482
 483   for (i = 0; i < concurrency; i++)
 484     ptx_dev->async_streams.arr[i] = NULL;
 485
 486   return true;
 487 }
 488
 489 static bool
 490 fini_streams_for_device (struct ptx_device *ptx_dev)
 491 {
 492   free (ptx_dev->async_streams.arr);
 493
 494   bool ret = true;
 495   while (ptx_dev->active_streams != NULL)
 496     {
 497       struct ptx_stream *s = ptx_dev->active_streams;
 498       ptx_dev->active_streams = ptx_dev->active_streams->next;
 499
 500       ret &= map_fini (s);
 501
 502       CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
 503       if (r != CUDA_SUCCESS)
 504         {
 505           GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
 506           ret = false;
 507         }
 508       free (s);
 509     }
 510
 511   ret &= map_fini (ptx_dev->null_stream);
 512   free (ptx_dev->null_stream);
 513   return ret;
 514 }
 515
 516 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
 517    thread THREAD (and also current device/context).  If CREATE is true, create
 518    the stream if it does not exist (or use EXISTING if it is non-NULL), and
 519    associate the stream with the same thread argument.  Returns stream to use
 520    as result.  */
 521
 522 static struct ptx_stream *
 523 select_stream_for_async (int async, pthread_t thread, bool create,
 524                          CUstream existing)
 525 {
 526   struct nvptx_thread *nvthd = nvptx_thread ();
 527   /* Local copy of TLS variable.  */
 528   struct ptx_device *ptx_dev = nvthd->ptx_dev;
 529   struct ptx_stream *stream = NULL;
 530   int orig_async = async;
 531
 532   /* The special value acc_async_noval (-1) maps (for now) to an
 533      implicitly-created stream, which is then handled the same as any other
 534      numbered async stream.  Other options are available, e.g. using the null
 535      stream for anonymous async operations, or choosing an idle stream from an
 536      active set.  But, stick with this for now.  */
 537   if (async > acc_async_sync)
 538     async++;
 539
 540   if (create)
 541     pthread_mutex_lock (&ptx_dev->stream_lock);
 542
 543   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
 544      null stream, and in fact better performance may be obtainable if it doesn't
 545      (because the null stream enforces overly-strict synchronisation with
 546      respect to other streams for legacy reasons, and that's probably not
 547      needed with OpenACC).  Maybe investigate later.  */
 548   if (async == acc_async_sync)
 549     stream = ptx_dev->null_stream;
 550   else if (async >= 0 && async < ptx_dev->async_streams.size
 551            && ptx_dev->async_streams.arr[async] && !(create && existing))
 552     stream = ptx_dev->async_streams.arr[async];
 553   else if (async >= 0 && create)
 554     {
 555       if (async >= ptx_dev->async_streams.size)
 556         {
 557           int i, newsize = ptx_dev->async_streams.size * 2;
 558
 559           if (async >= newsize)
 560             newsize = async + 1;
 561
 562           ptx_dev->async_streams.arr
 563             = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
 564                                    newsize * sizeof (struct ptx_stream *));
 565
 566           for (i = ptx_dev->async_streams.size; i < newsize; i++)
 567             ptx_dev->async_streams.arr[i] = NULL;
 568
 569           ptx_dev->async_streams.size = newsize;
 570         }
 571
 572       /* Create a new stream on-demand if there isn't one already, or if we're
 573          setting a particular async value to an existing (externally-provided)
 574          stream.  */
 575       if (!ptx_dev->async_streams.arr[async] || existing)
 576         {
 577           CUresult r;
 578           struct ptx_stream *s
 579             = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 580
 581           if (existing)
 582             s->stream = existing;
 583           else
 584             {
 585               r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
 586                                      CU_STREAM_DEFAULT);
 587               if (r != CUDA_SUCCESS)
 588                 {
 589                   pthread_mutex_unlock (&ptx_dev->stream_lock);
 590                   GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
 591                                      cuda_error (r));
 592                 }
 593             }
 594
 595           /* If CREATE is true, we're going to be queueing some work on this
 596              stream.  Associate it with the current host thread.  */
 597           s->host_thread = thread;
 598           s->multithreaded = false;
 599
 600           s->d = (CUdeviceptr) NULL;
 601           s->h = NULL;
 602           if (!map_init (s))
 603             {
 604               pthread_mutex_unlock (&ptx_dev->stream_lock);
 605               GOMP_PLUGIN_fatal ("map_init fail");
 606             }
 607
 608           s->next = ptx_dev->active_streams;
 609           ptx_dev->active_streams = s;
 610           ptx_dev->async_streams.arr[async] = s;
 611         }
 612
 613       stream = ptx_dev->async_streams.arr[async];
 614     }
 615   else if (async < 0)
 616     {
 617       if (create)
 618         pthread_mutex_unlock (&ptx_dev->stream_lock);
 619       GOMP_PLUGIN_fatal ("bad async %d", async);
 620     }
 621
 622   if (create)
 623     {
 624       assert (stream != NULL);
 625
 626       /* If we're trying to use the same stream from different threads
 627          simultaneously, set stream->multithreaded to true.  This affects the
 628          behaviour of acc_async_test_all and acc_wait_all, which are supposed to
 629          only wait for asynchronous launches from the same host thread they are
 630          invoked on.  If multiple threads use the same async value, we make note
 631          of that here and fall back to testing/waiting for all threads in those
 632          functions.  */
 633       if (thread != stream->host_thread)
 634         stream->multithreaded = true;
 635
 636       pthread_mutex_unlock (&ptx_dev->stream_lock);
 637     }
 638   else if (stream && !stream->multithreaded
 639            && !pthread_equal (stream->host_thread, thread))
 640     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
 641
 642   return stream;
 643 }
 644
 645 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 646    should be locked on entry and remains locked on exit.  */
 647
 648 static bool
 649 nvptx_init (void)
 650 {
 651   int ndevs;
 652
 653   if (instantiated_devices != 0)
 654     return true;
 655
 656   ptx_events = NULL;
 657   pthread_mutex_init (&ptx_event_lock, NULL);
 658
 659   if (!init_cuda_lib ())
 660     return false;
 661
 662   CUDA_CALL (cuInit, 0);
 663
 664   CUDA_CALL (cuDeviceGetCount, &ndevs);
 665   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 666                                             * ndevs);
 667   return true;
 668 }
 669
 670 /* Select the N'th PTX device for the current host thread.  The device must
 671    have been previously opened before calling this function.  */
 672
 673 static bool
 674 nvptx_attach_host_thread_to_device (int n)
 675 {
 676   CUdevice dev;
 677   CUresult r;
 678   struct ptx_device *ptx_dev;
 679   CUcontext thd_ctx;
 680
 681   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
 682   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 683     {
 684       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 685       return false;
 686     }
 687
 688   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 689     return true;
 690   else
 691     {
 692       CUcontext old_ctx;
 693
 694       ptx_dev = ptx_devices[n];
 695       if (!ptx_dev)
 696         {
 697           GOMP_PLUGIN_error ("device %d not found", n);
 698           return false;
 699         }
 700
 701       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 702
 703       /* We don't necessarily have a current context (e.g. if it has been
 704          destroyed.  Pop it if we do though.  */
 705       if (thd_ctx != NULL)
 706         CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 707
 708       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
 709     }
 710   return true;
 711 }
 712
 713 static struct ptx_device *
 714 nvptx_open_device (int n)
 715 {
 716   struct ptx_device *ptx_dev;
 717   CUdevice dev, ctx_dev;
 718   CUresult r;
 719   int async_engines, pi;
 720
 721   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 722
 723   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 724
 725   ptx_dev->ord = n;
 726   ptx_dev->dev = dev;
 727   ptx_dev->ctx_shared = false;
 728
 729   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
 730   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 731     {
 732       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 733       return NULL;
 734     }
 735
 736   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 737     {
 738       /* The current host thread has an active context for a different device.
 739          Detach it.  */
 740       CUcontext old_ctx;
 741       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
 742     }
 743
 744   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 745
 746   if (!ptx_dev->ctx)
 747     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 748   else
 749     ptx_dev->ctx_shared = true;
 750
 751   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 752                   &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 753   ptx_dev->overlap = pi;
 754
 755   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 756                   &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 757   ptx_dev->map = pi;
 758
 759   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 760                   &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 761   ptx_dev->concur = pi;
 762
 763   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 764                   &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 765   ptx_dev->mode = pi;
 766
 767   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 768                   &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 769   ptx_dev->mkern = pi;
 770
 771   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 772                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 773   ptx_dev->clock_khz = pi;
 774
 775   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 776                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 777   ptx_dev->num_sms = pi;
 778
 779   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 780                   &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 781   ptx_dev->regs_per_block = pi;
 782
 783   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
 784      in CUDA 6.0 and newer.  */
 785   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
 786   /* Fallback: use limit of registers per block, which is usually equal.  */
 787   if (r == CUDA_ERROR_INVALID_VALUE)
 788     pi = ptx_dev->regs_per_block;
 789   else if (r != CUDA_SUCCESS)
 790     {
 791       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
 792       return NULL;
 793     }
 794   ptx_dev->regs_per_sm = pi;
 795
 796   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 797                   &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 798   if (pi != 32)
 799     {
 800       GOMP_PLUGIN_error ("Only warp size 32 is supported");
 801       return NULL;
 802     }
 803
 804   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 805                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 806   if (r != CUDA_SUCCESS)
 807     async_engines = 1;
 808
 809   ptx_dev->images = NULL;
 810   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 811
 812   if (!init_streams_for_device (ptx_dev, async_engines))
 813     return NULL;
 814
 815   return ptx_dev;
 816 }
 817
 818 static bool
 819 nvptx_close_device (struct ptx_device *ptx_dev)
 820 {
 821   if (!ptx_dev)
 822     return true;
 823
 824   if (!fini_streams_for_device (ptx_dev))
 825     return false;
 826
 827   pthread_mutex_destroy (&ptx_dev->image_lock);
 828
 829   if (!ptx_dev->ctx_shared)
 830     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 831
 832   free (ptx_dev);
 833   return true;
 834 }
 835
 836 static int
 837 nvptx_get_num_devices (void)
 838 {
 839   int n;
 840
 841   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
 842      configurations.  */
 843   if (sizeof (void *) != 8)
 844     {
 845       GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
 846                          " only 64-bit configurations are supported\n");
 847       return 0;
 848     }
 849
 850   /* This function will be called before the plugin has been initialized in
 851      order to enumerate available devices, but CUDA API routines can't be used
 852      until cuInit has been called.  Just call it now (but don't yet do any
 853      further initialization).  */
 854   if (instantiated_devices == 0)
 855     {
 856       if (!init_cuda_lib ())
 857         return 0;
 858       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
 859       /* This is not an error: e.g. we may have CUDA libraries installed but
 860          no devices available.  */
 861       if (r != CUDA_SUCCESS)
 862         {
 863           GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
 864                              cuda_error (r));
 865           return 0;
 866         }
 867     }
 868
 869   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
 870   return n;
 871 }
 872
 873 static void
 874 notify_var (const char *var_name, const char *env_var)
 875 {
 876   if (env_var == NULL)
 877     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
 878   else
 879     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 880 }
 881
 882 static void
 883 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
 884 {
 885   const char *var_name = "GOMP_NVPTX_JIT";
 886   const char *env_var = secure_getenv (var_name);
 887   notify_var (var_name, env_var);
 888
 889   if (env_var == NULL)
 890     return;
 891
 892   const char *c = env_var;
 893   while (*c != '\0')
 894     {
 895       while (*c == ' ')
 896         c++;
 897
 898       if (c[0] == '-' && c[1] == 'O'
 899           && '0' <= c[2] && c[2] <= '4'
 900           && (c[3] == '\0' || c[3] == ' '))
 901         {
 902           *gomp_nvptx_o = c[2] - '0';
 903           c += 3;
 904           continue;
 905         }
 906
 907       GOMP_PLUGIN_error ("Error parsing %s", var_name);
 908       break;
 909     }
 910 }
 911
 912 static bool
 913 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 914           unsigned num_objs)
 915 {
 916   CUjit_option opts[7];
 917   void *optvals[7];
 918   float elapsed = 0.0;
 919   char elog[1024];
 920   char ilog[16384];
 921   CUlinkState linkstate;
 922   CUresult r;
 923   void *linkout;
 924   size_t linkoutsize __attribute__ ((unused));
 925
 926   opts[0] = CU_JIT_WALL_TIME;
 927   optvals[0] = &elapsed;
 928
 929   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 930   optvals[1] = &ilog[0];
 931
 932   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 933   optvals[2] = (void *) sizeof ilog;
 934
 935   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 936   optvals[3] = &elog[0];
 937
 938   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 939   optvals[4] = (void *) sizeof elog;
 940
 941   opts[5] = CU_JIT_LOG_VERBOSE;
 942   optvals[5] = (void *) 1;
 943
 944   static intptr_t gomp_nvptx_o = -1;
 945
 946   static bool init_done = false;
 947   if (!init_done)
 948     {
 949       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
 950       init_done = true;
 951   }
 952
 953   int nopts = 6;
 954   if (gomp_nvptx_o != -1)
 955     {
 956       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
 957       optvals[nopts] = (void *) gomp_nvptx_o;
 958       nopts++;
 959     }
 960
 961   CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
 962
 963   for (; num_objs--; ptx_objs++)
 964     {
 965       /* cuLinkAddData's 'data' argument erroneously omits the const
 966          qualifier.  */
 967       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 968       r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
 969                              (char *) ptx_objs->code, ptx_objs->size,
 970                              0, 0, 0, 0);
 971       if (r != CUDA_SUCCESS)
 972         {
 973           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 974           GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
 975                              cuda_error (r));
 976           return false;
 977         }
 978     }
 979
 980   GOMP_PLUGIN_debug (0, "Linking\n");
 981   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 982
 983   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 984   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 985
 986   if (r != CUDA_SUCCESS)
 987     {
 988       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
 989       return false;
 990     }
 991
 992   CUDA_CALL (cuModuleLoadData, module, linkout);
 993   CUDA_CALL (cuLinkDestroy, linkstate);
 994   return true;
 995 }
 996
 997 static void
 998 event_gc (bool memmap_lockable)
 999 {
1000   struct ptx_event *ptx_event = ptx_events;
1001   struct ptx_event *async_cleanups = NULL;
1002   struct nvptx_thread *nvthd = nvptx_thread ();
1003
1004   pthread_mutex_lock (&ptx_event_lock);
1005
1006   while (ptx_event != NULL)
1007     {
1008       CUresult r;
1009       struct ptx_event *e = ptx_event;
1010
1011       ptx_event = ptx_event->next;
1012
1013       if (e->ord != nvthd->ptx_dev->ord)
1014         continue;
1015
1016       r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1017       if (r == CUDA_SUCCESS)
1018         {
1019           bool append_async = false;
1020           CUevent *te;
1021
1022           te = e->evt;
1023
1024           switch (e->type)
1025             {
1026             case PTX_EVT_MEM:
1027             case PTX_EVT_SYNC:
1028               break;
1029
1030             case PTX_EVT_KNL:
1031               map_pop (e->addr);
1032               break;
1033
1034             case PTX_EVT_ASYNC_CLEANUP:
1035               {
1036                 /* The function gomp_plugin_async_unmap_vars needs to claim the
1037                    memory-map splay tree lock for the current device, so we
1038                    can't call it when one of our callers has already claimed
1039                    the lock.  In that case, just delay the GC for this event
1040                    until later.  */
1041                 if (!memmap_lockable)
1042                   continue;
1043
1044                 append_async = true;
1045               }
1046               break;
1047             }
1048
1049           CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1050           free ((void *)te);
1051
1052           /* Unlink 'e' from ptx_events list.  */
1053           if (ptx_events == e)
1054             ptx_events = ptx_events->next;
1055           else
1056             {
1057               struct ptx_event *e_ = ptx_events;
1058               while (e_->next != e)
1059                 e_ = e_->next;
1060               e_->next = e_->next->next;
1061             }
1062
1063           if (append_async)
1064             {
1065               e->next = async_cleanups;
1066               async_cleanups = e;
1067             }
1068           else
1069             free (e);
1070         }
1071     }
1072
1073   pthread_mutex_unlock (&ptx_event_lock);
1074
1075   /* We have to do these here, after ptx_event_lock is released.  */
1076   while (async_cleanups)
1077     {
1078       struct ptx_event *e = async_cleanups;
1079       async_cleanups = async_cleanups->next;
1080
1081       GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1082       free (e);
1083     }
1084 }
1085
1086 static void
1087 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1088 {
1089   struct ptx_event *ptx_event;
1090   struct nvptx_thread *nvthd = nvptx_thread ();
1091
1092   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1093           || type == PTX_EVT_ASYNC_CLEANUP);
1094
1095   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1096   ptx_event->type = type;
1097   ptx_event->evt = e;
1098   ptx_event->addr = h;
1099   ptx_event->ord = nvthd->ptx_dev->ord;
1100   ptx_event->val = val;
1101
1102   pthread_mutex_lock (&ptx_event_lock);
1103
1104   ptx_event->next = ptx_events;
1105   ptx_events = ptx_event;
1106
1107   pthread_mutex_unlock (&ptx_event_lock);
1108 }
1109
1110 static void
1111 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1112             int async, unsigned *dims, void *targ_mem_desc)
1113 {
1114   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1115   CUfunction function;
1116   CUresult r;
1117   int i;
1118   struct ptx_stream *dev_str;
1119   void *kargs[1];
1120   void *hp, *dp;
1121   struct nvptx_thread *nvthd = nvptx_thread ();
1122   const char *maybe_abort_msg = "(perhaps abort was called)";
1123
1124   function = targ_fn->fn;
1125
1126   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1127   assert (dev_str == nvthd->current_stream);
1128
1129   /* Initialize the launch dimensions.  Typically this is constant,
1130      provided by the device compiler, but we must permit runtime
1131      values.  */
1132   int seen_zero = 0;
1133   for (i = 0; i != GOMP_DIM_MAX; i++)
1134     {
1135       if (targ_fn->launch->dim[i])
1136        dims[i] = targ_fn->launch->dim[i];
1137       if (!dims[i])
1138        seen_zero = 1;
1139     }
1140
1141   if (seen_zero)
1142     {
1143       /* See if the user provided GOMP_OPENACC_DIM environment
1144          variable to specify runtime defaults. */
1145       static int default_dims[GOMP_DIM_MAX];
1146
1147       pthread_mutex_lock (&ptx_dev_lock);
1148       if (!default_dims[0])
1149         {
1150           for (int i = 0; i < GOMP_DIM_MAX; ++i)
1151             default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
1152
1153           int warp_size, block_size, dev_size, cpu_size;
1154           CUdevice dev = nvptx_thread()->ptx_dev->dev;
1155           /* 32 is the default for known hardware.  */
1156           int gang = 0, worker = 32, vector = 32;
1157           CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1158
1159           cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1160           cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1161           cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1162           cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1163
1164           if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1165                                  dev) == CUDA_SUCCESS
1166               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1167                                     dev) == CUDA_SUCCESS
1168               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1169                                     dev) == CUDA_SUCCESS
1170               && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1171                                     dev) == CUDA_SUCCESS)
1172             {
1173               GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1174                                  " dev_size=%d, cpu_size=%d\n",
1175                                  warp_size, block_size, dev_size, cpu_size);
1176               gang = (cpu_size / block_size) * dev_size;
1177               worker = block_size / warp_size;
1178               vector = warp_size;
1179             }
1180
1181           /* There is no upper bound on the gang size.  The best size
1182              matches the hardware configuration.  Logical gangs are
1183              scheduled onto physical hardware.  To maximize usage, we
1184              should guess a large number.  */
1185           if (default_dims[GOMP_DIM_GANG] < 1)
1186             default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1187           /* The worker size must not exceed the hardware.  */
1188           if (default_dims[GOMP_DIM_WORKER] < 1
1189               || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1190             default_dims[GOMP_DIM_WORKER] = worker;
1191           /* The vector size must exactly match the hardware.  */
1192           if (default_dims[GOMP_DIM_VECTOR] < 1
1193               || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1194             default_dims[GOMP_DIM_VECTOR] = vector;
1195
1196           GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1197                              default_dims[GOMP_DIM_GANG],
1198                              default_dims[GOMP_DIM_WORKER],
1199                              default_dims[GOMP_DIM_VECTOR]);
1200         }
1201       pthread_mutex_unlock (&ptx_dev_lock);
1202
1203       for (i = 0; i != GOMP_DIM_MAX; i++)
1204         if (!dims[i])
1205           dims[i] = default_dims[i];
1206     }
1207
1208   /* This reserves a chunk of a pre-allocated page of memory mapped on both
1209      the host and the device. HP is a host pointer to the new chunk, and DP is
1210      the corresponding device pointer.  */
1211   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1212
1213   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
1214
1215   /* Copy the array of arguments to the mapped page.  */
1216   for (i = 0; i < mapnum; i++)
1217     ((void **) hp)[i] = devaddrs[i];
1218
1219   /* Copy the (device) pointers to arguments to the device (dp and hp might in
1220      fact have the same value on a unified-memory system).  */
1221   CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1222                     mapnum * sizeof (void *));
1223   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
1224                      " gangs=%u, workers=%u, vectors=%u\n",
1225                      __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1226                      dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1227
1228   // OpenACC            CUDA
1229   //
1230   // num_gangs          nctaid.x
1231   // num_workers        ntid.y
1232   // vector length      ntid.x
1233
1234   kargs[0] = &dp;
1235   CUDA_CALL_ASSERT (cuLaunchKernel, function,
1236                     dims[GOMP_DIM_GANG], 1, 1,
1237                     dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1238                     0, dev_str->stream, kargs, 0);
1239
1240 #ifndef DISABLE_ASYNC
1241   if (async < acc_async_noval)
1242     {
1243       r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1244       if (r == CUDA_ERROR_LAUNCH_FAILED)
1245         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1246                            maybe_abort_msg);
1247       else if (r != CUDA_SUCCESS)
1248         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1249     }
1250   else
1251     {
1252       CUevent *e;
1253
1254       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1255
1256       r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1257       if (r == CUDA_ERROR_LAUNCH_FAILED)
1258         GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1259                            maybe_abort_msg);
1260       else if (r != CUDA_SUCCESS)
1261         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1262
1263       event_gc (true);
1264
1265       CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1266
1267       event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1268     }
1269 #else
1270   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1271   if (r == CUDA_ERROR_LAUNCH_FAILED)
1272     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1273                        maybe_abort_msg);
1274   else if (r != CUDA_SUCCESS)
1275     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1276 #endif
1277
1278   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
1279                      targ_fn->launch->fn);
1280
1281 #ifndef DISABLE_ASYNC
1282   if (async < acc_async_noval)
1283 #endif
1284     map_pop (dev_str);
1285 }
1286
1287 void * openacc_get_current_cuda_context (void);
1288
1289 static void *
1290 nvptx_alloc (size_t s)
1291 {
1292   CUdeviceptr d;
1293
1294   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1295   return (void *) d;
1296 }
1297
1298 static bool
1299 nvptx_free (void *p)
1300 {
1301   CUdeviceptr pb;
1302   size_t ps;
1303
1304   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1305   if ((CUdeviceptr) p != pb)
1306     {
1307       GOMP_PLUGIN_error ("invalid device address");
1308       return false;
1309     }
1310
1311   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1312   return true;
1313 }
1314
1315
1316 static bool
1317 nvptx_host2dev (void *d, const void *h, size_t s)
1318 {
1319   CUdeviceptr pb;
1320   size_t ps;
1321   struct nvptx_thread *nvthd = nvptx_thread ();
1322
1323   if (!s)
1324     return true;
1325   if (!d)
1326     {
1327       GOMP_PLUGIN_error ("invalid device address");
1328       return false;
1329     }
1330
1331   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1332
1333   if (!pb)
1334     {
1335       GOMP_PLUGIN_error ("invalid device address");
1336       return false;
1337     }
1338   if (!h)
1339     {
1340       GOMP_PLUGIN_error ("invalid host address");
1341       return false;
1342     }
1343   if (d == h)
1344     {
1345       GOMP_PLUGIN_error ("invalid host or device address");
1346       return false;
1347     }
1348   if ((void *)(d + s) > (void *)(pb + ps))
1349     {
1350       GOMP_PLUGIN_error ("invalid size");
1351       return false;
1352     }
1353
1354 #ifndef DISABLE_ASYNC
1355   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1356     {
1357       CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1358       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1359       event_gc (false);
1360       CUDA_CALL (cuMemcpyHtoDAsync,
1361                  (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1362       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1363       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1364     }
1365   else
1366 #endif
1367     CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1368
1369   return true;
1370 }
1371
1372 static bool
1373 nvptx_dev2host (void *h, const void *d, size_t s)
1374 {
1375   CUdeviceptr pb;
1376   size_t ps;
1377   struct nvptx_thread *nvthd = nvptx_thread ();
1378
1379   if (!s)
1380     return true;
1381   if (!d)
1382     {
1383       GOMP_PLUGIN_error ("invalid device address");
1384       return false;
1385     }
1386
1387   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1388
1389   if (!pb)
1390     {
1391       GOMP_PLUGIN_error ("invalid device address");
1392       return false;
1393     }
1394   if (!h)
1395     {
1396       GOMP_PLUGIN_error ("invalid host address");
1397       return false;
1398     }
1399   if (d == h)
1400     {
1401       GOMP_PLUGIN_error ("invalid host or device address");
1402       return false;
1403     }
1404   if ((void *)(d + s) > (void *)(pb + ps))
1405     {
1406       GOMP_PLUGIN_error ("invalid size");
1407       return false;
1408     }
1409
1410 #ifndef DISABLE_ASYNC
1411   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1412     {
1413       CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1414       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1415       event_gc (false);
1416       CUDA_CALL (cuMemcpyDtoHAsync,
1417                  h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1418       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1419       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1420     }
1421   else
1422 #endif
1423     CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1424
1425   return true;
1426 }
1427
1428 static void
1429 nvptx_set_async (int async)
1430 {
1431   struct nvptx_thread *nvthd = nvptx_thread ();
1432   nvthd->current_stream
1433     = select_stream_for_async (async, pthread_self (), true, NULL);
1434 }
1435
1436 static int
1437 nvptx_async_test (int async)
1438 {
1439   CUresult r;
1440   struct ptx_stream *s;
1441
1442   s = select_stream_for_async (async, pthread_self (), false, NULL);
1443
1444   if (!s)
1445     GOMP_PLUGIN_fatal ("unknown async %d", async);
1446
1447   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1448   if (r == CUDA_SUCCESS)
1449     {
1450       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1451          whether all work has completed on this stream, and if so omits the call
1452          to the wait hook.  If that happens, event_gc might not get called
1453          (which prevents variables from getting unmapped and their associated
1454          device storage freed), so call it here.  */
1455       event_gc (true);
1456       return 1;
1457     }
1458   else if (r == CUDA_ERROR_NOT_READY)
1459     return 0;
1460
1461   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1462
1463   return 0;
1464 }
1465
1466 static int
1467 nvptx_async_test_all (void)
1468 {
1469   struct ptx_stream *s;
1470   pthread_t self = pthread_self ();
1471   struct nvptx_thread *nvthd = nvptx_thread ();
1472
1473   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1474
1475   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1476     {
1477       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1478           && CUDA_CALL_NOCHECK (cuStreamQuery,
1479                                 s->stream) == CUDA_ERROR_NOT_READY)
1480         {
1481           pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1482           return 0;
1483         }
1484     }
1485
1486   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1487
1488   event_gc (true);
1489
1490   return 1;
1491 }
1492
1493 static void
1494 nvptx_wait (int async)
1495 {
1496   struct ptx_stream *s;
1497
1498   s = select_stream_for_async (async, pthread_self (), false, NULL);
1499   if (!s)
1500     GOMP_PLUGIN_fatal ("unknown async %d", async);
1501
1502   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1503
1504   event_gc (true);
1505 }
1506
1507 static void
1508 nvptx_wait_async (int async1, int async2)
1509 {
1510   CUevent *e;
1511   struct ptx_stream *s1, *s2;
1512   pthread_t self = pthread_self ();
1513
1514   /* The stream that is waiting (rather than being waited for) doesn't
1515      necessarily have to exist already.  */
1516   s2 = select_stream_for_async (async2, self, true, NULL);
1517
1518   s1 = select_stream_for_async (async1, self, false, NULL);
1519   if (!s1)
1520     GOMP_PLUGIN_fatal ("invalid async 1\n");
1521
1522   if (s1 == s2)
1523     GOMP_PLUGIN_fatal ("identical parameters");
1524
1525   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1526
1527   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1528
1529   event_gc (true);
1530
1531   CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1532
1533   event_add (PTX_EVT_SYNC, e, NULL, 0);
1534
1535   CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1536 }
1537
1538 static void
1539 nvptx_wait_all (void)
1540 {
1541   CUresult r;
1542   struct ptx_stream *s;
1543   pthread_t self = pthread_self ();
1544   struct nvptx_thread *nvthd = nvptx_thread ();
1545
1546   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1547
1548   /* Wait for active streams initiated by this thread (or by multiple threads)
1549      to complete.  */
1550   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1551     {
1552       if (s->multithreaded || pthread_equal (s->host_thread, self))
1553         {
1554           r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1555           if (r == CUDA_SUCCESS)
1556             continue;
1557           else if (r != CUDA_ERROR_NOT_READY)
1558             GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1559
1560           CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1561         }
1562     }
1563
1564   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1565
1566   event_gc (true);
1567 }
1568
1569 static void
1570 nvptx_wait_all_async (int async)
1571 {
1572   struct ptx_stream *waiting_stream, *other_stream;
1573   CUevent *e;
1574   struct nvptx_thread *nvthd = nvptx_thread ();
1575   pthread_t self = pthread_self ();
1576
1577   /* The stream doing the waiting.  This could be the first mention of the
1578      stream, so create it if necessary.  */
1579   waiting_stream
1580     = select_stream_for_async (async, pthread_self (), true, NULL);
1581
1582   /* Launches on the null stream already block on other streams in the
1583      context.  */
1584   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1585     return;
1586
1587   event_gc (true);
1588
1589   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1590
1591   for (other_stream = nvthd->ptx_dev->active_streams;
1592        other_stream != NULL;
1593        other_stream = other_stream->next)
1594     {
1595       if (!other_stream->multithreaded
1596           && !pthread_equal (other_stream->host_thread, self))
1597         continue;
1598
1599       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1600
1601       CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1602
1603       /* Record an event on the waited-for stream.  */
1604       CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1605
1606       event_add (PTX_EVT_SYNC, e, NULL, 0);
1607
1608       CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1609    }
1610
1611   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1612 }
1613
1614 static void *
1615 nvptx_get_current_cuda_device (void)
1616 {
1617   struct nvptx_thread *nvthd = nvptx_thread ();
1618
1619   if (!nvthd || !nvthd->ptx_dev)
1620     return NULL;
1621
1622   return &nvthd->ptx_dev->dev;
1623 }
1624
1625 static void *
1626 nvptx_get_current_cuda_context (void)
1627 {
1628   struct nvptx_thread *nvthd = nvptx_thread ();
1629
1630   if (!nvthd || !nvthd->ptx_dev)
1631     return NULL;
1632
1633   return nvthd->ptx_dev->ctx;
1634 }
1635
1636 static void *
1637 nvptx_get_cuda_stream (int async)
1638 {
1639   struct ptx_stream *s;
1640   struct nvptx_thread *nvthd = nvptx_thread ();
1641
1642   if (!nvthd || !nvthd->ptx_dev)
1643     return NULL;
1644
1645   s = select_stream_for_async (async, pthread_self (), false, NULL);
1646
1647   return s ? s->stream : NULL;
1648 }
1649
1650 static int
1651 nvptx_set_cuda_stream (int async, void *stream)
1652 {
1653   struct ptx_stream *oldstream;
1654   pthread_t self = pthread_self ();
1655   struct nvptx_thread *nvthd = nvptx_thread ();
1656
1657   if (async < 0)
1658     GOMP_PLUGIN_fatal ("bad async %d", async);
1659
1660   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1661
1662   /* We have a list of active streams and an array mapping async values to
1663      entries of that list.  We need to take "ownership" of the passed-in stream,
1664      and add it to our list, removing the previous entry also (if there was one)
1665      in order to prevent resource leaks.  Note the potential for surprise
1666      here: maybe we should keep track of passed-in streams and leave it up to
1667      the user to tidy those up, but that doesn't work for stream handles
1668      returned from acc_get_cuda_stream above...  */
1669
1670   oldstream = select_stream_for_async (async, self, false, NULL);
1671
1672   if (oldstream)
1673     {
1674       if (nvthd->ptx_dev->active_streams == oldstream)
1675         nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1676       else
1677         {
1678           struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1679           while (s->next != oldstream)
1680             s = s->next;
1681           s->next = s->next->next;
1682         }
1683
1684       CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1685
1686       if (!map_fini (oldstream))
1687         GOMP_PLUGIN_fatal ("error when freeing host memory");
1688
1689       free (oldstream);
1690     }
1691
1692   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1693
1694   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1695
1696   return 1;
1697 }
1698
1699 /* Plugin entry points.  */
1700
1701 const char *
1702 GOMP_OFFLOAD_get_name (void)
1703 {
1704   return "nvptx";
1705 }
1706
1707 unsigned int
1708 GOMP_OFFLOAD_get_caps (void)
1709 {
1710   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1711 }
1712
1713 int
1714 GOMP_OFFLOAD_get_type (void)
1715 {
1716   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1717 }
1718
1719 int
1720 GOMP_OFFLOAD_get_num_devices (void)
1721 {
1722   return nvptx_get_num_devices ();
1723 }
1724
1725 bool
1726 GOMP_OFFLOAD_init_device (int n)
1727 {
1728   struct ptx_device *dev;
1729
1730   pthread_mutex_lock (&ptx_dev_lock);
1731
1732   if (!nvptx_init () || ptx_devices[n] != NULL)
1733     {
1734       pthread_mutex_unlock (&ptx_dev_lock);
1735       return false;
1736     }
1737
1738   dev = nvptx_open_device (n);
1739   if (dev)
1740     {
1741       ptx_devices[n] = dev;
1742       instantiated_devices++;
1743     }
1744
1745   pthread_mutex_unlock (&ptx_dev_lock);
1746
1747   return dev != NULL;
1748 }
1749
1750 bool
1751 GOMP_OFFLOAD_fini_device (int n)
1752 {
1753   pthread_mutex_lock (&ptx_dev_lock);
1754
1755   if (ptx_devices[n] != NULL)
1756     {
1757       if (!nvptx_attach_host_thread_to_device (n)
1758           || !nvptx_close_device (ptx_devices[n]))
1759         {
1760           pthread_mutex_unlock (&ptx_dev_lock);
1761           return false;
1762         }
1763       ptx_devices[n] = NULL;
1764       instantiated_devices--;
1765     }
1766
1767   pthread_mutex_unlock (&ptx_dev_lock);
1768   return true;
1769 }
1770
1771 /* Return the libgomp version number we're compatible with.  There is
1772    no requirement for cross-version compatibility.  */
1773
1774 unsigned
1775 GOMP_OFFLOAD_version (void)
1776 {
1777   return GOMP_VERSION;
1778 }
1779
1780 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1781
1782 static void
1783 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1784 {
1785   CUdeviceptr dptr;
1786   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1787                                   module, "__nvptx_clocktick");
1788   if (r == CUDA_ERROR_NOT_FOUND)
1789     return;
1790   if (r != CUDA_SUCCESS)
1791     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1792   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1793   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1794                          sizeof (__nvptx_clocktick));
1795   if (r != CUDA_SUCCESS)
1796     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1797 }
1798
1799 /* Load the (partial) program described by TARGET_DATA to device
1800    number ORD.  Allocate and return TARGET_TABLE.  */
1801
1802 int
1803 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1804                          struct addr_pair **target_table)
1805 {
1806   CUmodule module;
1807   const char *const *var_names;
1808   const struct targ_fn_launch *fn_descs;
1809   unsigned int fn_entries, var_entries, i, j;
1810   struct targ_fn_descriptor *targ_fns;
1811   struct addr_pair *targ_tbl;
1812   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1813   struct ptx_image_data *new_image;
1814   struct ptx_device *dev;
1815
1816   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1817     {
1818       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1819                          " (expected %u, received %u)",
1820                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1821       return -1;
1822     }
1823
1824   if (!nvptx_attach_host_thread_to_device (ord)
1825       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1826     return -1;
1827
1828   dev = ptx_devices[ord];
1829
1830   /* The mkoffload utility emits a struct of pointers/integers at the
1831      start of each offload image.  The array of kernel names and the
1832      functions addresses form a one-to-one correspondence.  */
1833
1834   var_entries = img_header->var_num;
1835   var_names = img_header->var_names;
1836   fn_entries = img_header->fn_num;
1837   fn_descs = img_header->fn_descs;
1838
1839   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1840                                  * (fn_entries + var_entries));
1841   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1842                                  * fn_entries);
1843
1844   *target_table = targ_tbl;
1845
1846   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1847   new_image->target_data = target_data;
1848   new_image->module = module;
1849   new_image->fns = targ_fns;
1850
1851   pthread_mutex_lock (&dev->image_lock);
1852   new_image->next = dev->images;
1853   dev->images = new_image;
1854   pthread_mutex_unlock (&dev->image_lock);
1855
1856   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1857     {
1858       CUfunction function;
1859       int nregs, mthrs;
1860
1861       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1862                       fn_descs[i].fn);
1863       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1864                       CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1865       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1866                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1867
1868       targ_fns->fn = function;
1869       targ_fns->launch = &fn_descs[i];
1870       targ_fns->regs_per_thread = nregs;
1871       targ_fns->max_threads_per_block = mthrs;
1872
1873       targ_tbl->start = (uintptr_t) targ_fns;
1874       targ_tbl->end = targ_tbl->start + 1;
1875     }
1876
1877   for (j = 0; j < var_entries; j++, targ_tbl++)
1878     {
1879       CUdeviceptr var;
1880       size_t bytes;
1881
1882       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1883                       &var, &bytes, module, var_names[j]);
1884
1885       targ_tbl->start = (uintptr_t) var;
1886       targ_tbl->end = targ_tbl->start + bytes;
1887     }
1888
1889   nvptx_set_clocktick (module, dev);
1890
1891   return fn_entries + var_entries;
1892 }
1893
1894 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1895    function descriptors allocated by G_O_load_image.  */
1896
1897 bool
1898 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1899 {
1900   struct ptx_image_data *image, **prev_p;
1901   struct ptx_device *dev = ptx_devices[ord];
1902
1903   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1904     {
1905       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1906                          " (expected %u, received %u)",
1907                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1908       return false;
1909     }
1910
1911   bool ret = true;
1912   pthread_mutex_lock (&dev->image_lock);
1913   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1914     if (image->target_data == target_data)
1915       {
1916         *prev_p = image->next;
1917         if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1918           ret = false;
1919         free (image->fns);
1920         free (image);
1921         break;
1922       }
1923   pthread_mutex_unlock (&dev->image_lock);
1924   return ret;
1925 }
1926
1927 void *
1928 GOMP_OFFLOAD_alloc (int ord, size_t size)
1929 {
1930   if (!nvptx_attach_host_thread_to_device (ord))
1931     return NULL;
1932   return nvptx_alloc (size);
1933 }
1934
1935 bool
1936 GOMP_OFFLOAD_free (int ord, void *ptr)
1937 {
1938   return (nvptx_attach_host_thread_to_device (ord)
1939           && nvptx_free (ptr));
1940 }
1941
1942 bool
1943 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1944 {
1945   return (nvptx_attach_host_thread_to_device (ord)
1946           && nvptx_dev2host (dst, src, n));
1947 }
1948
1949 bool
1950 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1951 {
1952   return (nvptx_attach_host_thread_to_device (ord)
1953           && nvptx_host2dev (dst, src, n));
1954 }
1955
1956 bool
1957 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1958 {
1959   struct ptx_device *ptx_dev = ptx_devices[ord];
1960   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1961                                 ptx_dev->null_stream->stream);
1962   return true;
1963 }
1964
1965 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1966
1967 void
1968 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1969                            void **hostaddrs, void **devaddrs,
1970                            int async, unsigned *dims, void *targ_mem_desc)
1971 {
1972   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1973 }
1974
1975 void
1976 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1977 {
1978   struct nvptx_thread *nvthd = nvptx_thread ();
1979   CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1980
1981   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1982   CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1983   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1984 }
1985
1986 int
1987 GOMP_OFFLOAD_openacc_async_test (int async)
1988 {
1989   return nvptx_async_test (async);
1990 }
1991
1992 int
1993 GOMP_OFFLOAD_openacc_async_test_all (void)
1994 {
1995   return nvptx_async_test_all ();
1996 }
1997
1998 void
1999 GOMP_OFFLOAD_openacc_async_wait (int async)
2000 {
2001   nvptx_wait (async);
2002 }
2003
2004 void
2005 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
2006 {
2007   nvptx_wait_async (async1, async2);
2008 }
2009
2010 void
2011 GOMP_OFFLOAD_openacc_async_wait_all (void)
2012 {
2013   nvptx_wait_all ();
2014 }
2015
2016 void
2017 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
2018 {
2019   nvptx_wait_all_async (async);
2020 }
2021
2022 void
2023 GOMP_OFFLOAD_openacc_async_set_async (int async)
2024 {
2025   nvptx_set_async (async);
2026 }
2027
2028 void *
2029 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2030 {
2031   struct ptx_device *ptx_dev;
2032   struct nvptx_thread *nvthd
2033     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2034   CUcontext thd_ctx;
2035
2036   ptx_dev = ptx_devices[ord];
2037
2038   assert (ptx_dev);
2039
2040   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2041
2042   assert (ptx_dev->ctx);
2043
2044   if (!thd_ctx)
2045     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2046
2047   nvthd->current_stream = ptx_dev->null_stream;
2048   nvthd->ptx_dev = ptx_dev;
2049
2050   return (void *) nvthd;
2051 }
2052
2053 void
2054 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2055 {
2056   free (data);
2057 }
2058
2059 void *
2060 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2061 {
2062   return nvptx_get_current_cuda_device ();
2063 }
2064
2065 void *
2066 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2067 {
2068   return nvptx_get_current_cuda_context ();
2069 }
2070
2071 /* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
2072
2073 void *
2074 GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
2075 {
2076   return nvptx_get_cuda_stream (async);
2077 }
2078
2079 /* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
2080
2081 int
2082 GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
2083 {
2084   return nvptx_set_cuda_stream (async, stream);
2085 }
2086
2087 /* Adjust launch dimensions: pick good values for number of blocks and warps
2088    and ensure that number of warps does not exceed CUDA limits as well as GCC's
2089    own limits.  */
2090
2091 static void
2092 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2093                             struct ptx_device *ptx_dev,
2094                             int *teams_p, int *threads_p)
2095 {
2096   int max_warps_block = fn->max_threads_per_block / 32;
2097   /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2098      and libgcc, which matches documented limit of all GPUs as of 2015.  */
2099   if (max_warps_block > 32)
2100     max_warps_block = 32;
2101   if (*threads_p <= 0)
2102     *threads_p = 8;
2103   if (*threads_p > max_warps_block)
2104     *threads_p = max_warps_block;
2105
2106   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2107   /* This is an estimate of how many blocks the device can host simultaneously.
2108      Actual limit, which may be lower, can be queried with "occupancy control"
2109      driver interface (since CUDA 6.0).  */
2110   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2111   if (*teams_p <= 0 || *teams_p > max_blocks)
2112     *teams_p = max_blocks;
2113 }
2114
2115 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2116    target regions.  */
2117
2118 static size_t
2119 nvptx_stacks_size ()
2120 {
2121   return 128 * 1024;
2122 }
2123
2124 /* Return contiguous storage for NUM stacks, each SIZE bytes.  */
2125
2126 static void *
2127 nvptx_stacks_alloc (size_t size, int num)
2128 {
2129   CUdeviceptr stacks;
2130   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2131   if (r != CUDA_SUCCESS)
2132     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2133   return (void *) stacks;
2134 }
2135
2136 /* Release storage previously allocated by nvptx_stacks_alloc.  */
2137
2138 static void
2139 nvptx_stacks_free (void *p, int num)
2140 {
2141   CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2142   if (r != CUDA_SUCCESS)
2143     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2144 }
2145
2146 void
2147 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2148 {
2149   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2150   CUresult r;
2151   struct ptx_device *ptx_dev = ptx_devices[ord];
2152   const char *maybe_abort_msg = "(perhaps abort was called)";
2153   int teams = 0, threads = 0;
2154
2155   if (!args)
2156     GOMP_PLUGIN_fatal ("No target arguments provided");
2157   while (*args)
2158     {
2159       intptr_t id = (intptr_t) *args++, val;
2160       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2161         val = (intptr_t) *args++;
2162       else
2163         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2164       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2165         continue;
2166       val = val > INT_MAX ? INT_MAX : val;
2167       id &= GOMP_TARGET_ARG_ID_MASK;
2168       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2169         teams = val;
2170       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2171         threads = val;
2172     }
2173   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2174
2175   size_t stack_size = nvptx_stacks_size ();
2176   void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2177   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2178   size_t fn_args_size = sizeof fn_args;
2179   void *config[] = {
2180     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2181     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2182     CU_LAUNCH_PARAM_END
2183   };
2184   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2185                          32, threads, 1, 0, ptx_dev->null_stream->stream,
2186                          NULL, config);
2187   if (r != CUDA_SUCCESS)
2188     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2189
2190   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2191   if (r == CUDA_ERROR_LAUNCH_FAILED)
2192     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2193                        maybe_abort_msg);
2194   else if (r != CUDA_SUCCESS)
2195     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2196   nvptx_stacks_free (stacks, teams * threads);
2197 }
2198
2199 void
2200 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2201                         void *async_data)
2202 {
2203   GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
2204 }