/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
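
/* Each ptx_stream carries one host/device-visible page used as a ring
   buffer of kernel-argument blocks.  A block is a struct map header (the
   async value and the block's total size) followed by the MAPPINGS
   payload.  H_NEXT, H_PREV and H_TAIL in ptx_stream track the write,
   last-write and read positions within [H_BEGIN, H_END): map_push
   allocates a block at the head and map_pop retires the oldest block,
   wrapping around the page as needed.  */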

static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
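
/* A minimal sketch of how the launch path pairs map_push with map_pop
   (compare nvptx_exec below).  This helper is hypothetical and is not
   called by the plugin; it assumes S is a valid, initialized stream.  */

static void __attribute__ ((unused))
map_example_roundtrip (struct ptx_stream *s, void **args, size_t nargs)
{
  void *hp, *dp;

  /* Reserve a block large enough for NARGS pointers (the struct map
     header is added by map_push itself); async value 0 for brevity.  */
  map_push (s, 0, nargs * sizeof (void *), &hp, &dp);

  /* Stage the pointers through the host view of the mapped page.  */
  memcpy (hp, args, nargs * sizeof (void *));

  /* ... a kernel would read the same block through DP here ...  */

  /* Retire the oldest block once its consumer has finished.  */
  map_pop (s);
}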

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;
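
  /* After this shift, acc_async_noval (-1) indexes slot 0 of the
     async_streams array and a user-visible async value N indexes slot
     N + 1; only acc_async_sync (-2) stays negative and is handled
     specially below.  */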

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
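
/* Callers pass CREATE as false when they only need to look up an existing
   stream (e.g. nvptx_exec, nvptx_async_test below) and as true when they
   are about to queue work on it or bind a stream to an async value
   (e.g. nvptx_set_async, nvptx_wait_async, nvptx_set_cuda_stream).  */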

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}

static void
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));

  r = cuLinkDestroy (linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkDestroy error: %s", cuda_error (r));
}

static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x
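
  /* Concretely: gangs become the X dimension of the CUDA grid, the vector
     length becomes the X dimension of each thread block, and workers become
     each block's Y dimension, matching the cuLaunchKernel call below.  */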

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
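
/* A user-side sketch of the interop path that ends up in
   nvptx_set_cuda_stream above: the application creates its own CUDA stream
   and binds it to an OpenACC async queue via the openacc.h API.  This
   helper is hypothetical and not part of the plugin; it assumes a current
   CUDA context, and queue number 5 is chosen arbitrarily.  */

static void __attribute__ ((unused))
example_bind_user_stream (void)
{
  CUstream stream;

  if (cuStreamCreate (&stream, CU_STREAM_DEFAULT) != CUDA_SUCCESS)
    return;

  /* Work queued with 'async (5)' now runs on STREAM; the plugin takes
     ownership and destroys any stream previously bound to queue 5.  */
  acc_set_cuda_stream (5, stream);
}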

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
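
/* The returned table therefore holds FN_ENTRIES function entries first,
   each with START pointing at a host-side targ_fn_descriptor (and a dummy
   one-byte extent) rather than device memory, followed by VAR_ENTRIES
   variable entries giving real device address ranges.  */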

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}