/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
static const struct
{
  CUresult r;
  const char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static const char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    if (cuda_errlist[i].r == r)
      return cuda_errlist[i].m;

  static char errmsg[30];

  snprintf (errmsg, sizeof (errmsg), "unknown error code: %d", r);

  return errmsg;
}
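/* Illustrative sketch (not part of the plugin): every CUDA driver call in
   this file follows the same check-and-abort pattern.  A hypothetical
   wrapper macro like the one below captures that pattern; the plugin itself
   spells it out longhand at each call site.  */
#if 0
#define CUDA_CALL(FN, ...)						\
  do									\
    {									\
      CUresult r_ = FN (__VA_ARGS__);					\
      if (r_ != CUDA_SUCCESS)						\
	GOMP_PLUGIN_fatal (#FN " error: %s", cuda_error (r_));		\
    }									\
  while (0)

/* Usage: CUDA_CALL (cuInit, 0);  */
#endif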
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
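/* Illustrative sketch (not part of the plugin): map_push carves a chunk out
   of the page-locked ring buffer attached to a stream, and map_pop retires
   the oldest chunk (in the asynchronous paths, event_gc calls map_pop once
   the associated CUDA event has completed).  A hypothetical caller wanting
   room for N pointers might look like this.  */
#if 0
static void
example_ring_usage (struct ptx_stream *s, int async, size_t n)
{
  void *h, *d;

  /* Reserve space for N pointers, mapped on both host and device.  */
  map_push (s, async, n * sizeof (void *), &h, &d);

  /* ... fill in the chunk via H, launch work that reads it via D ...  */

  /* Synchronous case: retire the chunk immediately.  Asynchronous callers
     instead register a PTX_EVT_KNL event and let event_gc pop it.  */
  map_pop (s);
}
#endif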
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

/* A loaded PTX image.  */

struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
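/* Illustrative sketch (not part of the plugin): how OpenACC async arguments
   land in the stream array, assuming the values defined in openacc.h
   (acc_async_sync == -2, acc_async_noval == -1).  Non-sync values are
   shifted up by one so that acc_async_noval occupies slot 0.  */
#if 0
static void
example_async_mapping (void)
{
  /* acc_async_sync -> ptx_dev->null_stream.  */
  select_stream_for_async (acc_async_sync, pthread_self (), false, NULL);

  /* acc_async_noval -> async_streams.arr[0], created on demand.  */
  select_stream_for_async (acc_async_noval, pthread_self (), true, NULL);

  /* User async value 5 -> async_streams.arr[6], growing the array if
     necessary.  */
  select_stream_for_async (5, pthread_self (), true, NULL);
}
#endif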
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
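/* Illustrative sketch (not part of the plugin): link_ptx accepts any
   NUL-terminated PTX source string.  A hypothetical smoke test might hand
   it a trivial kernel like the one below (the PTX text is an assumption,
   written for the compute_30 target selected above).  */
#if 0
static const char example_ptx[] =
  ".version 3.1\n"
  ".target sm_30\n"
  ".address_size 64\n"
  ".visible .entry example_kernel ()\n"
  "{\n"
  "  ret;\n"
  "}\n";

static void
example_link (void)
{
  CUmodule module;
  link_ptx (&module, example_ptx);
}
#endif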
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
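/* Illustrative sketch (not part of the plugin): the asynchronous paths all
   follow the same event lifecycle -- allocate an event, enqueue work,
   record the event behind it, register it with event_add, and let a later
   event_gc pass reap it once cuEventQuery reports completion.  */
#if 0
static void
example_event_lifecycle (CUstream stream)
{
  CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  /* ... enqueue asynchronous work on STREAM here ...  */
  cuEventRecord (*e, stream);
  event_add (PTX_EVT_SYNC, e, NULL);

  /* Some later call into the plugin destroys and frees the event.  */
  event_gc (true);
}
#endif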
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
	    int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  // OpenACC		CUDA
  //
  // num_gangs		blocks
  // num_workers	warps (where a warp is equivalent to 32 threads)
  // vector length	threads

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
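/* Illustrative sketch (not part of the plugin): with the mapping described
   above, an OpenACC region with num_gangs(32) vector_length(128) becomes a
   CUDA launch of 32 blocks of 128 threads; num_workers is not used by this
   version of the launch code.  */
#if 0
static void
example_launch (CUfunction function, CUstream stream, void **kargs)
{
  CUresult r = cuLaunchKernel (function,
			       32, 1, 1,	/* num_gangs -> grid X.  */
			       128, 1, 1,	/* vector_length -> block X.  */
			       0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
}
#endif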
void * openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the call
	 to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
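/* Illustrative sketch (not part of the plugin): how an application would
   exercise the stream-interoperability code above through the public
   OpenACC API declared in openacc.h.  */
#if 0
static void
example_stream_interop (void)
{
  CUstream user_stream;

  cuStreamCreate (&user_stream, CU_STREAM_DEFAULT);

  /* Route OpenACC async queue 3 onto the user-created stream...  */
  acc_set_cuda_stream (3, user_stream);

  /* ... and read it back; both calls end up in the hooks above.  */
  CUstream s = (CUstream) acc_get_cuda_stream (3);
}
#endif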
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const char *const *fn_names;
  size_t fn_num;
} nvptx_tdata_t;

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *fn_names, *const *var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
		       " (expected %u, received %u)",
		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_names = img_header->fn_names;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->name = (const char *) fn_names[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
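/* Illustrative sketch (not part of the plugin): the returned table holds one
   addr_pair per kernel followed by one per global variable, e.g. for two
   functions and one variable:

     target_table[0] = { &targ_fns[0], &targ_fns[0] + 1 }   first kernel
     target_table[1] = { &targ_fns[1], &targ_fns[1] + 1 }   second kernel
     target_table[2] = { var, var + bytes }                 device global

   Function entries point at targ_fn_descriptor records rather than device
   addresses, which is why their 'end' is just 'start' + 1.  */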
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}