libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2016 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #include "openacc.h"
  35 #include "config.h"
  36 #include "libgomp-plugin.h"
  37 #include "oacc-plugin.h"
  38 #include "gomp-constants.h"
  39
  40 #include <pthread.h>
  41 #include <cuda.h>
  42 #include <stdbool.h>
  43 #include <stdint.h>
  44 #include <string.h>
  45 #include <stdio.h>
  46 #include <unistd.h>
  47 #include <assert.h>
  48
  49 static const char *
  50 cuda_error (CUresult r)
  51 {
  52 #if CUDA_VERSION < 7000
  53   /* Specified in documentation and present in library from at least
  54      5.5.  Not declared in header file prior to 7.0.  */
  55   extern CUresult cuGetErrorString (CUresult, const char **);
  56 #endif
  57   const char *desc;
  58
  59   r = cuGetErrorString (r, &desc);
  60   if (r != CUDA_SUCCESS)
  61     desc = "unknown cuda error";
  62
  63   return desc;
  64 }
  65
  66 static unsigned int instantiated_devices = 0;
  67 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
  68
  69 struct ptx_stream
  70 {
  71   CUstream stream;
  72   pthread_t host_thread;
  73   bool multithreaded;
  74
  75   CUdeviceptr d;
  76   void *h;
  77   void *h_begin;
  78   void *h_end;
  79   void *h_next;
  80   void *h_prev;
  81   void *h_tail;
  82
  83   struct ptx_stream *next;
  84 };
  85
  86 /* Thread-specific data for PTX.  */
  87
  88 struct nvptx_thread
  89 {
  90   struct ptx_stream *current_stream;
  91   struct ptx_device *ptx_dev;
  92 };
  93
  94 struct map
  95 {
  96   int     async;
  97   size_t  size;
  98   char    mappings[0];
  99 };
 100
 101 static void
 102 map_init (struct ptx_stream *s)
 103 {
 104   CUresult r;
 105
 106   int size = getpagesize ();
 107
 108   assert (s);
 109   assert (!s->d);
 110   assert (!s->h);
 111
 112   r = cuMemAllocHost (&s->h, size);
 113   if (r != CUDA_SUCCESS)
 114     GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
 115
 116   r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
 117   if (r != CUDA_SUCCESS)
 118     GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
 119
 120   assert (s->h);
 121
 122   s->h_begin = s->h;
 123   s->h_end = s->h_begin + size;
 124   s->h_next = s->h_prev = s->h_tail = s->h_begin;
 125
 126   assert (s->h_next);
 127   assert (s->h_end);
 128 }
 129
 130 static void
 131 map_fini (struct ptx_stream *s)
 132 {
 133   CUresult r;
 134
 135   r = cuMemFreeHost (s->h);
 136   if (r != CUDA_SUCCESS)
 137     GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
 138 }
 139
 140 static void
 141 map_pop (struct ptx_stream *s)
 142 {
 143   struct map *m;
 144
 145   assert (s != NULL);
 146   assert (s->h_next);
 147   assert (s->h_prev);
 148   assert (s->h_tail);
 149
 150   m = s->h_tail;
 151
 152   s->h_tail += m->size;
 153
 154   if (s->h_tail >= s->h_end)
 155     s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
 156
 157   if (s->h_next == s->h_tail)
 158     s->h_prev = s->h_next;
 159
 160   assert (s->h_next >= s->h_begin);
 161   assert (s->h_tail >= s->h_begin);
 162   assert (s->h_prev >= s->h_begin);
 163
 164   assert (s->h_next <= s->h_end);
 165   assert (s->h_tail <= s->h_end);
 166   assert (s->h_prev <= s->h_end);
 167 }
 168
 169 static void
 170 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
 171 {
 172   int left;
 173   int offset;
 174   struct map *m;
 175
 176   assert (s != NULL);
 177
 178   left = s->h_end - s->h_next;
 179   size += sizeof (struct map);
 180
 181   assert (s->h_prev);
 182   assert (s->h_next);
 183
 184   if (size >= left)
 185     {
 186       m = s->h_prev;
 187       m->size += left;
 188       s->h_next = s->h_begin;
 189
 190       if (s->h_next + size > s->h_end)
 191         GOMP_PLUGIN_fatal ("unable to push map");
 192     }
 193
 194   assert (s->h_next);
 195
 196   m = s->h_next;
 197   m->async = async;
 198   m->size = size;
 199
 200   offset = (void *)&m->mappings[0] - s->h;
 201
 202   *d = (void *)(s->d + offset);
 203   *h = (void *)(s->h + offset);
 204
 205   s->h_prev = s->h_next;
 206   s->h_next += size;
 207
 208   assert (s->h_prev);
 209   assert (s->h_next);
 210
 211   assert (s->h_next >= s->h_begin);
 212   assert (s->h_tail >= s->h_begin);
 213   assert (s->h_prev >= s->h_begin);
 214   assert (s->h_next <= s->h_end);
 215   assert (s->h_tail <= s->h_end);
 216   assert (s->h_prev <= s->h_end);
 217
 218   return;
 219 }
 220
 221 /* Target data function launch information.  */
 222
 223 struct targ_fn_launch
 224 {
 225   const char *fn;
 226   unsigned short dim[GOMP_DIM_MAX];
 227 };
 228
 229 /* Target PTX object information.  */
 230
 231 struct targ_ptx_obj
 232 {
 233   const char *code;
 234   size_t size;
 235 };
 236
 237 /* Target data image information.  */
 238
 239 typedef struct nvptx_tdata
 240 {
 241   const struct targ_ptx_obj *ptx_objs;
 242   unsigned ptx_num;
 243
 244   const char *const *var_names;
 245   unsigned var_num;
 246
 247   const struct targ_fn_launch *fn_descs;
 248   unsigned fn_num;
 249 } nvptx_tdata_t;
 250
 251 /* Descriptor of a loaded function.  */
 252
 253 struct targ_fn_descriptor
 254 {
 255   CUfunction fn;
 256   const struct targ_fn_launch *launch;
 257 };
 258
 259 /* A loaded PTX image.  */
 260 struct ptx_image_data
 261 {
 262   const void *target_data;
 263   CUmodule module;
 264
 265   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 266
 267   struct ptx_image_data *next;
 268 };
 269
 270 struct ptx_device
 271 {
 272   CUcontext ctx;
 273   bool ctx_shared;
 274   CUdevice dev;
 275   struct ptx_stream *null_stream;
 276   /* All non-null streams associated with this device (actually context),
 277      either created implicitly or passed in from the user (via
 278      acc_set_cuda_stream).  */
 279   struct ptx_stream *active_streams;
 280   struct {
 281     struct ptx_stream **arr;
 282     int size;
 283   } async_streams;
 284   /* A lock for use when manipulating the above stream list and array.  */
 285   pthread_mutex_t stream_lock;
 286   int ord;
 287   bool overlap;
 288   bool map;
 289   bool concur;
 290   int  mode;
 291   bool mkern;
 292
 293   struct ptx_image_data *images;  /* Images loaded on device.  */
 294   pthread_mutex_t image_lock;     /* Lock for above list.  */
 295
 296   struct ptx_device *next;
 297 };
 298
 299 enum ptx_event_type
 300 {
 301   PTX_EVT_MEM,
 302   PTX_EVT_KNL,
 303   PTX_EVT_SYNC,
 304   PTX_EVT_ASYNC_CLEANUP
 305 };
 306
 307 struct ptx_event
 308 {
 309   CUevent *evt;
 310   int type;
 311   void *addr;
 312   int ord;
 313
 314   struct ptx_event *next;
 315 };
 316
 317 static pthread_mutex_t ptx_event_lock;
 318 static struct ptx_event *ptx_events;
 319
 320 static struct ptx_device **ptx_devices;
 321
 322 static inline struct nvptx_thread *
 323 nvptx_thread (void)
 324 {
 325   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 326 }
 327
 328 static void
 329 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 330 {
 331   int i;
 332   struct ptx_stream *null_stream
 333     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 334
 335   null_stream->stream = NULL;
 336   null_stream->host_thread = pthread_self ();
 337   null_stream->multithreaded = true;
 338   null_stream->d = (CUdeviceptr) NULL;
 339   null_stream->h = NULL;
 340   map_init (null_stream);
 341   ptx_dev->null_stream = null_stream;
 342
 343   ptx_dev->active_streams = NULL;
 344   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 345
 346   if (concurrency < 1)
 347     concurrency = 1;
 348
 349   /* This is just a guess -- make space for as many async streams as the
 350      current device is capable of concurrently executing.  This can grow
 351      later as necessary.  No streams are created yet.  */
 352   ptx_dev->async_streams.arr
 353     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
 354   ptx_dev->async_streams.size = concurrency;
 355
 356   for (i = 0; i < concurrency; i++)
 357     ptx_dev->async_streams.arr[i] = NULL;
 358 }
 359
 360 static void
 361 fini_streams_for_device (struct ptx_device *ptx_dev)
 362 {
 363   free (ptx_dev->async_streams.arr);
 364
 365   while (ptx_dev->active_streams != NULL)
 366     {
 367       struct ptx_stream *s = ptx_dev->active_streams;
 368       ptx_dev->active_streams = ptx_dev->active_streams->next;
 369
 370       map_fini (s);
 371       cuStreamDestroy (s->stream);
 372       free (s);
 373     }
 374
 375   map_fini (ptx_dev->null_stream);
 376   free (ptx_dev->null_stream);
 377 }
 378
 379 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
 380    thread THREAD (and also current device/context).  If CREATE is true, create
 381    the stream if it does not exist (or use EXISTING if it is non-NULL), and
 382    associate the stream with the same thread argument.  Returns stream to use
 383    as result.  */
 384
 385 static struct ptx_stream *
 386 select_stream_for_async (int async, pthread_t thread, bool create,
 387                          CUstream existing)
 388 {
 389   struct nvptx_thread *nvthd = nvptx_thread ();
 390   /* Local copy of TLS variable.  */
 391   struct ptx_device *ptx_dev = nvthd->ptx_dev;
 392   struct ptx_stream *stream = NULL;
 393   int orig_async = async;
 394
 395   /* The special value acc_async_noval (-1) maps (for now) to an
 396      implicitly-created stream, which is then handled the same as any other
 397      numbered async stream.  Other options are available, e.g. using the null
 398      stream for anonymous async operations, or choosing an idle stream from an
 399      active set.  But, stick with this for now.  */
 400   if (async > acc_async_sync)
 401     async++;
 402
 403   if (create)
 404     pthread_mutex_lock (&ptx_dev->stream_lock);
 405
 406   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
 407      null stream, and in fact better performance may be obtainable if it doesn't
 408      (because the null stream enforces overly-strict synchronisation with
 409      respect to other streams for legacy reasons, and that's probably not
 410      needed with OpenACC).  Maybe investigate later.  */
 411   if (async == acc_async_sync)
 412     stream = ptx_dev->null_stream;
 413   else if (async >= 0 && async < ptx_dev->async_streams.size
 414            && ptx_dev->async_streams.arr[async] && !(create && existing))
 415     stream = ptx_dev->async_streams.arr[async];
 416   else if (async >= 0 && create)
 417     {
 418       if (async >= ptx_dev->async_streams.size)
 419         {
 420           int i, newsize = ptx_dev->async_streams.size * 2;
 421
 422           if (async >= newsize)
 423             newsize = async + 1;
 424
 425           ptx_dev->async_streams.arr
 426             = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
 427                                    newsize * sizeof (struct ptx_stream *));
 428
 429           for (i = ptx_dev->async_streams.size; i < newsize; i++)
 430             ptx_dev->async_streams.arr[i] = NULL;
 431
 432           ptx_dev->async_streams.size = newsize;
 433         }
 434
 435       /* Create a new stream on-demand if there isn't one already, or if we're
 436          setting a particular async value to an existing (externally-provided)
 437          stream.  */
 438       if (!ptx_dev->async_streams.arr[async] || existing)
 439         {
 440           CUresult r;
 441           struct ptx_stream *s
 442             = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 443
 444           if (existing)
 445             s->stream = existing;
 446           else
 447             {
 448               r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
 449               if (r != CUDA_SUCCESS)
 450                 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
 451             }
 452
 453           /* If CREATE is true, we're going to be queueing some work on this
 454              stream.  Associate it with the current host thread.  */
 455           s->host_thread = thread;
 456           s->multithreaded = false;
 457
 458           s->d = (CUdeviceptr) NULL;
 459           s->h = NULL;
 460           map_init (s);
 461
 462           s->next = ptx_dev->active_streams;
 463           ptx_dev->active_streams = s;
 464           ptx_dev->async_streams.arr[async] = s;
 465         }
 466
 467       stream = ptx_dev->async_streams.arr[async];
 468     }
 469   else if (async < 0)
 470     GOMP_PLUGIN_fatal ("bad async %d", async);
 471
 472   if (create)
 473     {
 474       assert (stream != NULL);
 475
 476       /* If we're trying to use the same stream from different threads
 477          simultaneously, set stream->multithreaded to true.  This affects the
 478          behaviour of acc_async_test_all and acc_wait_all, which are supposed to
 479          only wait for asynchronous launches from the same host thread they are
 480          invoked on.  If multiple threads use the same async value, we make note
 481          of that here and fall back to testing/waiting for all threads in those
 482          functions.  */
 483       if (thread != stream->host_thread)
 484         stream->multithreaded = true;
 485
 486       pthread_mutex_unlock (&ptx_dev->stream_lock);
 487     }
 488   else if (stream && !stream->multithreaded
 489            && !pthread_equal (stream->host_thread, thread))
 490     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
 491
 492   return stream;
 493 }
 494
 495 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 496    should be locked on entry and remains locked on exit.  */
 497
 498 static bool
 499 nvptx_init (void)
 500 {
 501   CUresult r;
 502   int ndevs;
 503
 504   if (instantiated_devices != 0)
 505     return true;
 506
 507   r = cuInit (0);
 508   if (r != CUDA_SUCCESS)
 509     GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
 510
 511   ptx_events = NULL;
 512
 513   pthread_mutex_init (&ptx_event_lock, NULL);
 514
 515   r = cuDeviceGetCount (&ndevs);
 516   if (r != CUDA_SUCCESS)
 517     GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
 518
 519   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 520                                             * ndevs);
 521
 522   return true;
 523 }
 524
 525 /* Select the N'th PTX device for the current host thread.  The device must
 526    have been previously opened before calling this function.  */
 527
 528 static void
 529 nvptx_attach_host_thread_to_device (int n)
 530 {
 531   CUdevice dev;
 532   CUresult r;
 533   struct ptx_device *ptx_dev;
 534   CUcontext thd_ctx;
 535
 536   r = cuCtxGetDevice (&dev);
 537   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 538     GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
 539
 540   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 541     return;
 542   else
 543     {
 544       CUcontext old_ctx;
 545
 546       ptx_dev = ptx_devices[n];
 547       assert (ptx_dev);
 548
 549       r = cuCtxGetCurrent (&thd_ctx);
 550       if (r != CUDA_SUCCESS)
 551         GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
 552
 553       /* We don't necessarily have a current context (e.g. if it has been
 554          destroyed.  Pop it if we do though.  */
 555       if (thd_ctx != NULL)
 556         {
 557           r = cuCtxPopCurrent (&old_ctx);
 558           if (r != CUDA_SUCCESS)
 559             GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
 560         }
 561
 562       r = cuCtxPushCurrent (ptx_dev->ctx);
 563       if (r != CUDA_SUCCESS)
 564         GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
 565     }
 566 }
 567
 568 static struct ptx_device *
 569 nvptx_open_device (int n)
 570 {
 571   struct ptx_device *ptx_dev;
 572   CUdevice dev, ctx_dev;
 573   CUresult r;
 574   int async_engines, pi;
 575
 576   r = cuDeviceGet (&dev, n);
 577   if (r != CUDA_SUCCESS)
 578     GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
 579
 580   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 581
 582   ptx_dev->ord = n;
 583   ptx_dev->dev = dev;
 584   ptx_dev->ctx_shared = false;
 585
 586   r = cuCtxGetDevice (&ctx_dev);
 587   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 588     GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
 589
 590   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 591     {
 592       /* The current host thread has an active context for a different device.
 593          Detach it.  */
 594       CUcontext old_ctx;
 595
 596       r = cuCtxPopCurrent (&old_ctx);
 597       if (r != CUDA_SUCCESS)
 598         GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
 599     }
 600
 601   r = cuCtxGetCurrent (&ptx_dev->ctx);
 602   if (r != CUDA_SUCCESS)
 603     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
 604
 605   if (!ptx_dev->ctx)
 606     {
 607       r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 608       if (r != CUDA_SUCCESS)
 609         GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
 610     }
 611   else
 612     ptx_dev->ctx_shared = true;
 613
 614   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 615   if (r != CUDA_SUCCESS)
 616     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
 617
 618   ptx_dev->overlap = pi;
 619
 620   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 621   if (r != CUDA_SUCCESS)
 622     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
 623
 624   ptx_dev->map = pi;
 625
 626   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 627   if (r != CUDA_SUCCESS)
 628     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
 629
 630   ptx_dev->concur = pi;
 631
 632   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 633   if (r != CUDA_SUCCESS)
 634     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
 635
 636   ptx_dev->mode = pi;
 637
 638   r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 639   if (r != CUDA_SUCCESS)
 640     GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
 641
 642   ptx_dev->mkern = pi;
 643
 644   r = cuDeviceGetAttribute (&async_engines,
 645                             CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 646   if (r != CUDA_SUCCESS)
 647     async_engines = 1;
 648
 649   ptx_dev->images = NULL;
 650   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 651
 652   init_streams_for_device (ptx_dev, async_engines);
 653
 654   return ptx_dev;
 655 }
 656
 657 static void
 658 nvptx_close_device (struct ptx_device *ptx_dev)
 659 {
 660   CUresult r;
 661
 662   if (!ptx_dev)
 663     return;
 664
 665   fini_streams_for_device (ptx_dev);
 666
 667   pthread_mutex_destroy (&ptx_dev->image_lock);
 668
 669   if (!ptx_dev->ctx_shared)
 670     {
 671       r = cuCtxDestroy (ptx_dev->ctx);
 672       if (r != CUDA_SUCCESS)
 673         GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
 674     }
 675
 676   free (ptx_dev);
 677 }
 678
 679 static int
 680 nvptx_get_num_devices (void)
 681 {
 682   int n;
 683   CUresult r;
 684
 685   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
 686      configurations.  */
 687   if (sizeof (void *) != 8)
 688     return 0;
 689
 690   /* This function will be called before the plugin has been initialized in
 691      order to enumerate available devices, but CUDA API routines can't be used
 692      until cuInit has been called.  Just call it now (but don't yet do any
 693      further initialization).  */
 694   if (instantiated_devices == 0)
 695     {
 696       r = cuInit (0);
 697       /* This is not an error: e.g. we may have CUDA libraries installed but
 698          no devices available.  */
 699       if (r != CUDA_SUCCESS)
 700         return 0;
 701     }
 702
 703   r = cuDeviceGetCount (&n);
 704   if (r!= CUDA_SUCCESS)
 705     GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
 706
 707   return n;
 708 }
 709
 710
 711 static void
 712 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 713           unsigned num_objs)
 714 {
 715   CUjit_option opts[6];
 716   void *optvals[6];
 717   float elapsed = 0.0;
 718 #define LOGSIZE 8192
 719   char elog[LOGSIZE];
 720   char ilog[LOGSIZE];
 721   unsigned long logsize = LOGSIZE;
 722   CUlinkState linkstate;
 723   CUresult r;
 724   void *linkout;
 725   size_t linkoutsize __attribute__ ((unused));
 726
 727   opts[0] = CU_JIT_WALL_TIME;
 728   optvals[0] = &elapsed;
 729
 730   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 731   optvals[1] = &ilog[0];
 732
 733   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 734   optvals[2] = (void *) logsize;
 735
 736   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 737   optvals[3] = &elog[0];
 738
 739   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 740   optvals[4] = (void *) logsize;
 741
 742   opts[5] = CU_JIT_LOG_VERBOSE;
 743   optvals[5] = (void *) 1;
 744
 745   r = cuLinkCreate (6, opts, optvals, &linkstate);
 746   if (r != CUDA_SUCCESS)
 747     GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
 748
 749   for (; num_objs--; ptx_objs++)
 750     {
 751       /* cuLinkAddData's 'data' argument erroneously omits the const
 752          qualifier.  */
 753       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 754       r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
 755                          ptx_objs->size, 0, 0, 0, 0);
 756       if (r != CUDA_SUCCESS)
 757         {
 758           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 759           GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s",
 760                              cuda_error (r));
 761         }
 762     }
 763
 764   GOMP_PLUGIN_debug (0, "Linking\n");
 765   r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
 766
 767   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 768   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 769
 770   if (r != CUDA_SUCCESS)
 771     GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
 772
 773   r = cuModuleLoadData (module, linkout);
 774   if (r != CUDA_SUCCESS)
 775     GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
 776
 777   r = cuLinkDestroy (linkstate);
 778   if (r != CUDA_SUCCESS)
 779     GOMP_PLUGIN_fatal ("cuLinkDestory error: %s", cuda_error (r));
 780 }
 781
 782 static void
 783 event_gc (bool memmap_lockable)
 784 {
 785   struct ptx_event *ptx_event = ptx_events;
 786   struct nvptx_thread *nvthd = nvptx_thread ();
 787
 788   pthread_mutex_lock (&ptx_event_lock);
 789
 790   while (ptx_event != NULL)
 791     {
 792       CUresult r;
 793       struct ptx_event *e = ptx_event;
 794
 795       ptx_event = ptx_event->next;
 796
 797       if (e->ord != nvthd->ptx_dev->ord)
 798         continue;
 799
 800       r = cuEventQuery (*e->evt);
 801       if (r == CUDA_SUCCESS)
 802         {
 803           CUevent *te;
 804
 805           te = e->evt;
 806
 807           switch (e->type)
 808             {
 809             case PTX_EVT_MEM:
 810             case PTX_EVT_SYNC:
 811               break;
 812
 813             case PTX_EVT_KNL:
 814               map_pop (e->addr);
 815               break;
 816
 817             case PTX_EVT_ASYNC_CLEANUP:
 818               {
 819                 /* The function gomp_plugin_async_unmap_vars needs to claim the
 820                    memory-map splay tree lock for the current device, so we
 821                    can't call it when one of our callers has already claimed
 822                    the lock.  In that case, just delay the GC for this event
 823                    until later.  */
 824                 if (!memmap_lockable)
 825                   continue;
 826
 827                 GOMP_PLUGIN_async_unmap_vars (e->addr);
 828               }
 829               break;
 830             }
 831
 832           cuEventDestroy (*te);
 833           free ((void *)te);
 834
 835           if (ptx_events == e)
 836             ptx_events = ptx_events->next;
 837           else
 838             {
 839               struct ptx_event *e_ = ptx_events;
 840               while (e_->next != e)
 841                 e_ = e_->next;
 842               e_->next = e_->next->next;
 843             }
 844
 845           free (e);
 846         }
 847     }
 848
 849   pthread_mutex_unlock (&ptx_event_lock);
 850 }
 851
 852 static void
 853 event_add (enum ptx_event_type type, CUevent *e, void *h)
 854 {
 855   struct ptx_event *ptx_event;
 856   struct nvptx_thread *nvthd = nvptx_thread ();
 857
 858   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
 859           || type == PTX_EVT_ASYNC_CLEANUP);
 860
 861   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
 862   ptx_event->type = type;
 863   ptx_event->evt = e;
 864   ptx_event->addr = h;
 865   ptx_event->ord = nvthd->ptx_dev->ord;
 866
 867   pthread_mutex_lock (&ptx_event_lock);
 868
 869   ptx_event->next = ptx_events;
 870   ptx_events = ptx_event;
 871
 872   pthread_mutex_unlock (&ptx_event_lock);
 873 }
 874
 875 void
 876 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 877             int async, unsigned *dims, void *targ_mem_desc)
 878 {
 879   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
 880   CUfunction function;
 881   CUresult r;
 882   int i;
 883   struct ptx_stream *dev_str;
 884   void *kargs[1];
 885   void *hp, *dp;
 886   struct nvptx_thread *nvthd = nvptx_thread ();
 887   const char *maybe_abort_msg = "(perhaps abort was called)";
 888
 889   function = targ_fn->fn;
 890
 891   dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
 892   assert (dev_str == nvthd->current_stream);
 893
 894   /* Initialize the launch dimensions.  Typically this is constant,
 895      provided by the device compiler, but we must permit runtime
 896      values.  */
 897   int seen_zero = 0;
 898   for (i = 0; i != GOMP_DIM_MAX; i++)
 899     {
 900       if (targ_fn->launch->dim[i])
 901        dims[i] = targ_fn->launch->dim[i];
 902       if (!dims[i])
 903        seen_zero = 1;
 904     }
 905
 906   if (seen_zero)
 907     {
 908       for (i = 0; i != GOMP_DIM_MAX; i++)
 909        if (!dims[i])
 910          dims[i] = /* TODO */ 32;
 911     }
 912
 913   /* This reserves a chunk of a pre-allocated page of memory mapped on both
 914      the host and the device. HP is a host pointer to the new chunk, and DP is
 915      the corresponding device pointer.  */
 916   map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
 917
 918   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 919
 920   /* Copy the array of arguments to the mapped page.  */
 921   for (i = 0; i < mapnum; i++)
 922     ((void **) hp)[i] = devaddrs[i];
 923
 924   /* Copy the (device) pointers to arguments to the device (dp and hp might in
 925      fact have the same value on a unified-memory system).  */
 926   r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
 927   if (r != CUDA_SUCCESS)
 928     GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
 929
 930   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 931                      " gangs=%u, workers=%u, vectors=%u\n",
 932                      __FUNCTION__, targ_fn->launch->fn,
 933                      dims[0], dims[1], dims[2]);
 934
 935   // OpenACC            CUDA
 936   //
 937   // num_gangs          nctaid.x
 938   // num_workers        ntid.y
 939   // vector length      ntid.x
 940
 941   kargs[0] = &dp;
 942   r = cuLaunchKernel (function,
 943                       dims[GOMP_DIM_GANG], 1, 1,
 944                       dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 945                       0, dev_str->stream, kargs, 0);
 946   if (r != CUDA_SUCCESS)
 947     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
 948
 949 #ifndef DISABLE_ASYNC
 950   if (async < acc_async_noval)
 951     {
 952       r = cuStreamSynchronize (dev_str->stream);
 953       if (r == CUDA_ERROR_LAUNCH_FAILED)
 954         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
 955                            maybe_abort_msg);
 956       else if (r != CUDA_SUCCESS)
 957         GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
 958     }
 959   else
 960     {
 961       CUevent *e;
 962
 963       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
 964
 965       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
 966       if (r == CUDA_ERROR_LAUNCH_FAILED)
 967         GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
 968                            maybe_abort_msg);
 969       else if (r != CUDA_SUCCESS)
 970         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
 971
 972       event_gc (true);
 973
 974       r = cuEventRecord (*e, dev_str->stream);
 975       if (r != CUDA_SUCCESS)
 976         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
 977
 978       event_add (PTX_EVT_KNL, e, (void *)dev_str);
 979     }
 980 #else
 981   r = cuCtxSynchronize ();
 982   if (r == CUDA_ERROR_LAUNCH_FAILED)
 983     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 984                        maybe_abort_msg);
 985   else if (r != CUDA_SUCCESS)
 986     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
 987 #endif
 988
 989   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
 990                      targ_fn->launch->fn);
 991
 992 #ifndef DISABLE_ASYNC
 993   if (async < acc_async_noval)
 994 #endif
 995     map_pop (dev_str);
 996 }
 997
 998 void * openacc_get_current_cuda_context (void);
 999
1000 static void *
1001 nvptx_alloc (size_t s)
1002 {
1003   CUdeviceptr d;
1004   CUresult r;
1005
1006   r = cuMemAlloc (&d, s);
1007   if (r == CUDA_ERROR_OUT_OF_MEMORY)
1008     return 0;
1009   if (r != CUDA_SUCCESS)
1010     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1011   return (void *)d;
1012 }
1013
1014 static void
1015 nvptx_free (void *p)
1016 {
1017   CUresult r;
1018   CUdeviceptr pb;
1019   size_t ps;
1020
1021   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1022   if (r != CUDA_SUCCESS)
1023     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1024
1025   if ((CUdeviceptr)p != pb)
1026     GOMP_PLUGIN_fatal ("invalid device address");
1027
1028   r = cuMemFree ((CUdeviceptr)p);
1029   if (r != CUDA_SUCCESS)
1030     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1031 }
1032
1033 static void *
1034 nvptx_host2dev (void *d, const void *h, size_t s)
1035 {
1036   CUresult r;
1037   CUdeviceptr pb;
1038   size_t ps;
1039   struct nvptx_thread *nvthd = nvptx_thread ();
1040
1041   if (!s)
1042     return 0;
1043
1044   if (!d)
1045     GOMP_PLUGIN_fatal ("invalid device address");
1046
1047   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1048   if (r != CUDA_SUCCESS)
1049     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1050
1051   if (!pb)
1052     GOMP_PLUGIN_fatal ("invalid device address");
1053
1054   if (!h)
1055     GOMP_PLUGIN_fatal ("invalid host address");
1056
1057   if (d == h)
1058     GOMP_PLUGIN_fatal ("invalid host or device address");
1059
1060   if ((void *)(d + s) > (void *)(pb + ps))
1061     GOMP_PLUGIN_fatal ("invalid size");
1062
1063 #ifndef DISABLE_ASYNC
1064   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1065     {
1066       CUevent *e;
1067
1068       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1069
1070       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1071       if (r != CUDA_SUCCESS)
1072         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1073
1074       event_gc (false);
1075
1076       r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
1077                              nvthd->current_stream->stream);
1078       if (r != CUDA_SUCCESS)
1079         GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
1080
1081       r = cuEventRecord (*e, nvthd->current_stream->stream);
1082       if (r != CUDA_SUCCESS)
1083         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1084
1085       event_add (PTX_EVT_MEM, e, (void *)h);
1086     }
1087   else
1088 #endif
1089     {
1090       r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
1091       if (r != CUDA_SUCCESS)
1092         GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1093     }
1094
1095   return 0;
1096 }
1097
1098 static void *
1099 nvptx_dev2host (void *h, const void *d, size_t s)
1100 {
1101   CUresult r;
1102   CUdeviceptr pb;
1103   size_t ps;
1104   struct nvptx_thread *nvthd = nvptx_thread ();
1105
1106   if (!s)
1107     return 0;
1108
1109   if (!d)
1110     GOMP_PLUGIN_fatal ("invalid device address");
1111
1112   r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1113   if (r != CUDA_SUCCESS)
1114     GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1115
1116   if (!pb)
1117     GOMP_PLUGIN_fatal ("invalid device address");
1118
1119   if (!h)
1120     GOMP_PLUGIN_fatal ("invalid host address");
1121
1122   if (d == h)
1123     GOMP_PLUGIN_fatal ("invalid host or device address");
1124
1125   if ((void *)(d + s) > (void *)(pb + ps))
1126     GOMP_PLUGIN_fatal ("invalid size");
1127
1128 #ifndef DISABLE_ASYNC
1129   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1130     {
1131       CUevent *e;
1132
1133       e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1134
1135       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1136       if (r != CUDA_SUCCESS)
1137         GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
1138
1139       event_gc (false);
1140
1141       r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
1142                              nvthd->current_stream->stream);
1143       if (r != CUDA_SUCCESS)
1144         GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
1145
1146       r = cuEventRecord (*e, nvthd->current_stream->stream);
1147       if (r != CUDA_SUCCESS)
1148         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1149
1150       event_add (PTX_EVT_MEM, e, (void *)h);
1151     }
1152   else
1153 #endif
1154     {
1155       r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1156       if (r != CUDA_SUCCESS)
1157         GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1158     }
1159
1160   return 0;
1161 }
1162
1163 static void
1164 nvptx_set_async (int async)
1165 {
1166   struct nvptx_thread *nvthd = nvptx_thread ();
1167   nvthd->current_stream
1168     = select_stream_for_async (async, pthread_self (), true, NULL);
1169 }
1170
1171 static int
1172 nvptx_async_test (int async)
1173 {
1174   CUresult r;
1175   struct ptx_stream *s;
1176
1177   s = select_stream_for_async (async, pthread_self (), false, NULL);
1178
1179   if (!s)
1180     GOMP_PLUGIN_fatal ("unknown async %d", async);
1181
1182   r = cuStreamQuery (s->stream);
1183   if (r == CUDA_SUCCESS)
1184     {
1185       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1186          whether all work has completed on this stream, and if so omits the call
1187          to the wait hook.  If that happens, event_gc might not get called
1188          (which prevents variables from getting unmapped and their associated
1189          device storage freed), so call it here.  */
1190       event_gc (true);
1191       return 1;
1192     }
1193   else if (r == CUDA_ERROR_NOT_READY)
1194     return 0;
1195
1196   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1197
1198   return 0;
1199 }
1200
1201 static int
1202 nvptx_async_test_all (void)
1203 {
1204   struct ptx_stream *s;
1205   pthread_t self = pthread_self ();
1206   struct nvptx_thread *nvthd = nvptx_thread ();
1207
1208   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1209
1210   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1211     {
1212       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1213           && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1214         {
1215           pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1216           return 0;
1217         }
1218     }
1219
1220   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1221
1222   event_gc (true);
1223
1224   return 1;
1225 }
1226
1227 static void
1228 nvptx_wait (int async)
1229 {
1230   CUresult r;
1231   struct ptx_stream *s;
1232
1233   s = select_stream_for_async (async, pthread_self (), false, NULL);
1234
1235   if (!s)
1236     GOMP_PLUGIN_fatal ("unknown async %d", async);
1237
1238   r = cuStreamSynchronize (s->stream);
1239   if (r != CUDA_SUCCESS)
1240     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1241
1242   event_gc (true);
1243 }
1244
1245 static void
1246 nvptx_wait_async (int async1, int async2)
1247 {
1248   CUresult r;
1249   CUevent *e;
1250   struct ptx_stream *s1, *s2;
1251   pthread_t self = pthread_self ();
1252
1253   /* The stream that is waiting (rather than being waited for) doesn't
1254      necessarily have to exist already.  */
1255   s2 = select_stream_for_async (async2, self, true, NULL);
1256
1257   s1 = select_stream_for_async (async1, self, false, NULL);
1258   if (!s1)
1259     GOMP_PLUGIN_fatal ("invalid async 1\n");
1260
1261   if (s1 == s2)
1262     GOMP_PLUGIN_fatal ("identical parameters");
1263
1264   e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1265
1266   r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1267   if (r != CUDA_SUCCESS)
1268     GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1269
1270   event_gc (true);
1271
1272   r = cuEventRecord (*e, s1->stream);
1273   if (r != CUDA_SUCCESS)
1274     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1275
1276   event_add (PTX_EVT_SYNC, e, NULL);
1277
1278   r = cuStreamWaitEvent (s2->stream, *e, 0);
1279   if (r != CUDA_SUCCESS)
1280     GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1281 }
1282
1283 static void
1284 nvptx_wait_all (void)
1285 {
1286   CUresult r;
1287   struct ptx_stream *s;
1288   pthread_t self = pthread_self ();
1289   struct nvptx_thread *nvthd = nvptx_thread ();
1290
1291   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1292
1293   /* Wait for active streams initiated by this thread (or by multiple threads)
1294      to complete.  */
1295   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1296     {
1297       if (s->multithreaded || pthread_equal (s->host_thread, self))
1298         {
1299           r = cuStreamQuery (s->stream);
1300           if (r == CUDA_SUCCESS)
1301             continue;
1302           else if (r != CUDA_ERROR_NOT_READY)
1303             GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1304
1305           r = cuStreamSynchronize (s->stream);
1306           if (r != CUDA_SUCCESS)
1307             GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1308         }
1309     }
1310
1311   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1312
1313   event_gc (true);
1314 }
1315
1316 static void
1317 nvptx_wait_all_async (int async)
1318 {
1319   CUresult r;
1320   struct ptx_stream *waiting_stream, *other_stream;
1321   CUevent *e;
1322   struct nvptx_thread *nvthd = nvptx_thread ();
1323   pthread_t self = pthread_self ();
1324
1325   /* The stream doing the waiting.  This could be the first mention of the
1326      stream, so create it if necessary.  */
1327   waiting_stream
1328     = select_stream_for_async (async, pthread_self (), true, NULL);
1329
1330   /* Launches on the null stream already block on other streams in the
1331      context.  */
1332   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1333     return;
1334
1335   event_gc (true);
1336
1337   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1338
1339   for (other_stream = nvthd->ptx_dev->active_streams;
1340        other_stream != NULL;
1341        other_stream = other_stream->next)
1342     {
1343       if (!other_stream->multithreaded
1344           && !pthread_equal (other_stream->host_thread, self))
1345         continue;
1346
1347       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1348
1349       r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1350       if (r != CUDA_SUCCESS)
1351         GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1352
1353       /* Record an event on the waited-for stream.  */
1354       r = cuEventRecord (*e, other_stream->stream);
1355       if (r != CUDA_SUCCESS)
1356         GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1357
1358       event_add (PTX_EVT_SYNC, e, NULL);
1359
1360       r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1361       if (r != CUDA_SUCCESS)
1362         GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1363    }
1364
1365   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1366 }
1367
1368 static void *
1369 nvptx_get_current_cuda_device (void)
1370 {
1371   struct nvptx_thread *nvthd = nvptx_thread ();
1372
1373   if (!nvthd || !nvthd->ptx_dev)
1374     return NULL;
1375
1376   return &nvthd->ptx_dev->dev;
1377 }
1378
1379 static void *
1380 nvptx_get_current_cuda_context (void)
1381 {
1382   struct nvptx_thread *nvthd = nvptx_thread ();
1383
1384   if (!nvthd || !nvthd->ptx_dev)
1385     return NULL;
1386
1387   return nvthd->ptx_dev->ctx;
1388 }
1389
1390 static void *
1391 nvptx_get_cuda_stream (int async)
1392 {
1393   struct ptx_stream *s;
1394   struct nvptx_thread *nvthd = nvptx_thread ();
1395
1396   if (!nvthd || !nvthd->ptx_dev)
1397     return NULL;
1398
1399   s = select_stream_for_async (async, pthread_self (), false, NULL);
1400
1401   return s ? s->stream : NULL;
1402 }
1403
1404 static int
1405 nvptx_set_cuda_stream (int async, void *stream)
1406 {
1407   struct ptx_stream *oldstream;
1408   pthread_t self = pthread_self ();
1409   struct nvptx_thread *nvthd = nvptx_thread ();
1410
1411   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1412
1413   if (async < 0)
1414     GOMP_PLUGIN_fatal ("bad async %d", async);
1415
1416   /* We have a list of active streams and an array mapping async values to
1417      entries of that list.  We need to take "ownership" of the passed-in stream,
1418      and add it to our list, removing the previous entry also (if there was one)
1419      in order to prevent resource leaks.  Note the potential for surprise
1420      here: maybe we should keep track of passed-in streams and leave it up to
1421      the user to tidy those up, but that doesn't work for stream handles
1422      returned from acc_get_cuda_stream above...  */
1423
1424   oldstream = select_stream_for_async (async, self, false, NULL);
1425
1426   if (oldstream)
1427     {
1428       if (nvthd->ptx_dev->active_streams == oldstream)
1429         nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1430       else
1431         {
1432           struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1433           while (s->next != oldstream)
1434             s = s->next;
1435           s->next = s->next->next;
1436         }
1437
1438       cuStreamDestroy (oldstream->stream);
1439       map_fini (oldstream);
1440       free (oldstream);
1441     }
1442
1443   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1444
1445   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1446
1447   return 1;
1448 }
1449
1450 /* Plugin entry points.  */
1451
1452 const char *
1453 GOMP_OFFLOAD_get_name (void)
1454 {
1455   return "nvptx";
1456 }
1457
1458 unsigned int
1459 GOMP_OFFLOAD_get_caps (void)
1460 {
1461   return GOMP_OFFLOAD_CAP_OPENACC_200;
1462 }
1463
1464 int
1465 GOMP_OFFLOAD_get_type (void)
1466 {
1467   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1468 }
1469
1470 int
1471 GOMP_OFFLOAD_get_num_devices (void)
1472 {
1473   return nvptx_get_num_devices ();
1474 }
1475
1476 void
1477 GOMP_OFFLOAD_init_device (int n)
1478 {
1479   pthread_mutex_lock (&ptx_dev_lock);
1480
1481   if (!nvptx_init () || ptx_devices[n] != NULL)
1482     {
1483       pthread_mutex_unlock (&ptx_dev_lock);
1484       return;
1485     }
1486
1487   ptx_devices[n] = nvptx_open_device (n);
1488   instantiated_devices++;
1489
1490   pthread_mutex_unlock (&ptx_dev_lock);
1491 }
1492
1493 void
1494 GOMP_OFFLOAD_fini_device (int n)
1495 {
1496   pthread_mutex_lock (&ptx_dev_lock);
1497
1498   if (ptx_devices[n] != NULL)
1499     {
1500       nvptx_attach_host_thread_to_device (n);
1501       nvptx_close_device (ptx_devices[n]);
1502       ptx_devices[n] = NULL;
1503       instantiated_devices--;
1504     }
1505
1506   pthread_mutex_unlock (&ptx_dev_lock);
1507 }
1508
1509 /* Return the libgomp version number we're compatible with.  There is
1510    no requirement for cross-version compatibility.  */
1511
1512 unsigned
1513 GOMP_OFFLOAD_version (void)
1514 {
1515   return GOMP_VERSION;
1516 }
1517
1518 /* Load the (partial) program described by TARGET_DATA to device
1519    number ORD.  Allocate and return TARGET_TABLE.  */
1520
1521 int
1522 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1523                          struct addr_pair **target_table)
1524 {
1525   CUmodule module;
1526   const char *const *var_names;
1527   const struct targ_fn_launch *fn_descs;
1528   unsigned int fn_entries, var_entries, i, j;
1529   CUresult r;
1530   struct targ_fn_descriptor *targ_fns;
1531   struct addr_pair *targ_tbl;
1532   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1533   struct ptx_image_data *new_image;
1534   struct ptx_device *dev;
1535
1536   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1537     GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
1538                        " (expected %u, received %u)",
1539                        GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1540
1541   GOMP_OFFLOAD_init_device (ord);
1542
1543   dev = ptx_devices[ord];
1544
1545   nvptx_attach_host_thread_to_device (ord);
1546
1547   link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);
1548
1549   /* The mkoffload utility emits a struct of pointers/integers at the
1550      start of each offload image.  The array of kernel names and the
1551      functions addresses form a one-to-one correspondence.  */
1552
1553   var_entries = img_header->var_num;
1554   var_names = img_header->var_names;
1555   fn_entries = img_header->fn_num;
1556   fn_descs = img_header->fn_descs;
1557
1558   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1559                                  * (fn_entries + var_entries));
1560   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1561                                  * fn_entries);
1562
1563   *target_table = targ_tbl;
1564
1565   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1566   new_image->target_data = target_data;
1567   new_image->module = module;
1568   new_image->fns = targ_fns;
1569
1570   pthread_mutex_lock (&dev->image_lock);
1571   new_image->next = dev->images;
1572   dev->images = new_image;
1573   pthread_mutex_unlock (&dev->image_lock);
1574
1575   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1576     {
1577       CUfunction function;
1578
1579       r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
1580       if (r != CUDA_SUCCESS)
1581         GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
1582
1583       targ_fns->fn = function;
1584       targ_fns->launch = &fn_descs[i];
1585
1586       targ_tbl->start = (uintptr_t) targ_fns;
1587       targ_tbl->end = targ_tbl->start + 1;
1588     }
1589
1590   for (j = 0; j < var_entries; j++, targ_tbl++)
1591     {
1592       CUdeviceptr var;
1593       size_t bytes;
1594
1595       r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
1596       if (r != CUDA_SUCCESS)
1597         GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1598
1599       targ_tbl->start = (uintptr_t) var;
1600       targ_tbl->end = targ_tbl->start + bytes;
1601     }
1602
1603   return fn_entries + var_entries;
1604 }
1605
1606 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1607    function descriptors allocated by G_O_load_image.  */
1608
1609 void
1610 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1611 {
1612   struct ptx_image_data *image, **prev_p;
1613   struct ptx_device *dev = ptx_devices[ord];
1614
1615   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1616     return;
1617
1618   pthread_mutex_lock (&dev->image_lock);
1619   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1620     if (image->target_data == target_data)
1621       {
1622         *prev_p = image->next;
1623         cuModuleUnload (image->module);
1624         free (image->fns);
1625         free (image);
1626         break;
1627       }
1628   pthread_mutex_unlock (&dev->image_lock);
1629 }
1630
1631 void *
1632 GOMP_OFFLOAD_alloc (int ord, size_t size)
1633 {
1634   nvptx_attach_host_thread_to_device (ord);
1635   return nvptx_alloc (size);
1636 }
1637
1638 void
1639 GOMP_OFFLOAD_free (int ord, void *ptr)
1640 {
1641   nvptx_attach_host_thread_to_device (ord);
1642   nvptx_free (ptr);
1643 }
1644
1645 void *
1646 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1647 {
1648   nvptx_attach_host_thread_to_device (ord);
1649   return nvptx_dev2host (dst, src, n);
1650 }
1651
1652 void *
1653 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1654 {
1655   nvptx_attach_host_thread_to_device (ord);
1656   return nvptx_host2dev (dst, src, n);
1657 }
1658
1659 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1660
1661 void
1662 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1663                                void **hostaddrs, void **devaddrs,
1664                                int async, unsigned *dims, void *targ_mem_desc)
1665 {
1666   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1667 }
1668
1669 void
1670 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
1671 {
1672   CUevent *e;
1673   CUresult r;
1674   struct nvptx_thread *nvthd = nvptx_thread ();
1675
1676   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1677
1678   r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1679   if (r != CUDA_SUCCESS)
1680     GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1681
1682   r = cuEventRecord (*e, nvthd->current_stream->stream);
1683   if (r != CUDA_SUCCESS)
1684     GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1685
1686   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
1687 }
1688
1689 int
1690 GOMP_OFFLOAD_openacc_async_test (int async)
1691 {
1692   return nvptx_async_test (async);
1693 }
1694
1695 int
1696 GOMP_OFFLOAD_openacc_async_test_all (void)
1697 {
1698   return nvptx_async_test_all ();
1699 }
1700
1701 void
1702 GOMP_OFFLOAD_openacc_async_wait (int async)
1703 {
1704   nvptx_wait (async);
1705 }
1706
1707 void
1708 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1709 {
1710   nvptx_wait_async (async1, async2);
1711 }
1712
1713 void
1714 GOMP_OFFLOAD_openacc_async_wait_all (void)
1715 {
1716   nvptx_wait_all ();
1717 }
1718
1719 void
1720 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1721 {
1722   nvptx_wait_all_async (async);
1723 }
1724
1725 void
1726 GOMP_OFFLOAD_openacc_async_set_async (int async)
1727 {
1728   nvptx_set_async (async);
1729 }
1730
1731 void *
1732 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1733 {
1734   struct ptx_device *ptx_dev;
1735   struct nvptx_thread *nvthd
1736     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1737   CUresult r;
1738   CUcontext thd_ctx;
1739
1740   ptx_dev = ptx_devices[ord];
1741
1742   assert (ptx_dev);
1743
1744   r = cuCtxGetCurrent (&thd_ctx);
1745   if (r != CUDA_SUCCESS)
1746     GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
1747
1748   assert (ptx_dev->ctx);
1749
1750   if (!thd_ctx)
1751     {
1752       r = cuCtxPushCurrent (ptx_dev->ctx);
1753       if (r != CUDA_SUCCESS)
1754         GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
1755     }
1756
1757   nvthd->current_stream = ptx_dev->null_stream;
1758   nvthd->ptx_dev = ptx_dev;
1759
1760   return (void *) nvthd;
1761 }
1762
1763 void
1764 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1765 {
1766   free (data);
1767 }
1768
1769 void *
1770 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1771 {
1772   return nvptx_get_current_cuda_device ();
1773 }
1774
1775 void *
1776 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1777 {
1778   return nvptx_get_current_cuda_context ();
1779 }
1780
1781 /* NOTE: This returns a CUstream, not a ptx_stream pointer.  */
1782
1783 void *
1784 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1785 {
1786   return nvptx_get_cuda_stream (async);
1787 }
1788
1789 /* NOTE: This takes a CUstream, not a ptx_stream pointer.  */
1790
1791 int
1792 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1793 {
1794   return nvptx_set_cuda_stream (async, stream);
1795 }