/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
static const char *
cuda_error (CUresult r)
{
#if CUDA_VERSION < 7000
  /* Specified in documentation and present in library from at least
     5.5.  Not declared in header file prior to 7.0.  */
  extern CUresult cuGetErrorString (CUresult, const char **);
#endif
  const char *desc;

  r = cuGetErrorString (r, &desc);
  if (r != CUDA_SUCCESS)
    desc = "unknown cuda error";

  return desc;
}
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
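
/* A CUDA stream together with a page-locked staging buffer that is mapped
   into both host and device address spaces.  The h_* pointers implement a
   circular allocator over that buffer (see map_push/map_pop below):
   H_NEXT is where the next chunk will be carved out, H_PREV is the most
   recently pushed chunk, and H_TAIL is the oldest chunk still live.  */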
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};
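
/* Allocate a page of page-locked host memory for stream S, obtain the
   corresponding device pointer, and initialize the circular-allocator
   pointers to cover the whole page.  */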
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
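
/* Retire the oldest mapping chunk on stream S, advancing H_TAIL past it
   and wrapping back to the start of the buffer if the end is reached.  */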
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
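
/* Carve a chunk of SIZE bytes (plus a struct map header) out of stream S's
   mapped buffer, wrapping to the start of the buffer when there is not
   enough room at the end.  On return, *H and *D are the host and device
   addresses of the chunk's payload.  */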
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[3];
};

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
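
/* Set up the null stream for PTX_DEV and pre-size the array that maps
   OpenACC async values to streams.  CONCURRENCY is used as the initial
   array size; no async streams are actually created until needed.  */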
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
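
/* Tear down all streams associated with PTX_DEV: destroy every active
   async stream and release the null stream's mapped buffer.  */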
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream
   to use.  */
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
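
/* Open PTX device number N: create a CUDA context for it (or share the
   caller's existing one), cache the device attributes we care about in a
   new ptx_device structure, and set up its streams.  Returns the new
   ptx_device.  */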
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
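
/* Close PTX_DEV: release its streams, destroy its CUDA context (unless the
   context was shared with the caller), and free the device structure.  */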
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
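
/* Return the number of CUDA devices available, or 0 if offloading is not
   supported in this configuration or CUDA cannot be initialized.  */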
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
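
/* JIT-link the given PTX_CODE together with the plugin's built-in PTX
   fragments (abort, acc_on_device and the GOACC internal routines) into a
   loaded CUDA module in *MODULE, targeting compute capability 3.0.  */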
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
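
/* Walk the global list of outstanding events and reclaim any that have
   completed on the current device: pop the associated mapping chunk for
   kernel events, unmap variables for async-cleanup events (only when
   MEMMAP_LOCKABLE permits taking the memory-map lock), then destroy the
   CUDA event and unlink it from the list.  */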
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
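
/* Record a new outstanding event of TYPE for CUDA event E, associated with
   address H, on the current device's list of pending events.  */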
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
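
/* Launch the kernel described by FN on the stream selected by ASYNC,
   passing it the array of device addresses DEVADDRS via a chunk of the
   stream's mapped buffer.  For a synchronous launch, wait for completion
   here; otherwise record an event so the mapping chunk can be reclaimed
   later by event_gc.  */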
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int async, unsigned *dims,
            void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  if (dims[GOMP_DIM_GANG] != 1)
    GOMP_PLUGIN_fatal ("non-unity num_gangs (%d) not supported",
                       dims[GOMP_DIM_GANG]);
  if (dims[GOMP_DIM_WORKER] != 1)
    GOMP_PLUGIN_fatal ("non-unity num_workers (%d) not supported",
                       dims[GOMP_DIM_WORKER]);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC          CUDA
  //
  // num_gangs        nctaid.x
  // num_workers      ntid.y
  // vector length    ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
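
/* Free device memory P previously obtained from nvptx_alloc, checking via
   cuMemGetAddressRange that P is the base of the allocation.  */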
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
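
/* Copy S bytes from host address H to device address D, after validating
   both addresses.  On a non-null stream the copy is issued asynchronously
   and tracked with a PTX_EVT_MEM event; on the null stream it blocks.  */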
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
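
/* As nvptx_host2dev, but copying S bytes from device address D back to
   host address H.  */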
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
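
/* Bind the current thread's active stream to the one selected by ASYNC,
   creating that stream if it does not exist yet.  */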
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
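
/* Return 1 if all work queued on the stream for ASYNC has completed, else
   0.  Completed work is garbage-collected as a side effect.  */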
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
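
/* Make the stream for ASYNC2 wait, on the device rather than by blocking
   the host, for all work currently queued on the stream for ASYNC1: record
   an event on ASYNC1's stream and insert a wait for it on ASYNC2's.  */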
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
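
/* Make the stream for ASYNC wait on the device for every other stream this
   thread has queued work on, again via recorded events rather than by
   blocking the host.  */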
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
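
/* Install the user-provided CUstream STREAM as the stream for ASYNC,
   destroying and replacing any stream previously bound to that value.  */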
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}
void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const struct targ_fn_launch *fn_descs;
  size_t fn_num;
} nvptx_tdata_t;

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
                       " (expected %u, received %u)",
                       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs,
                               size_t *sizes, unsigned short *kinds,
                               int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds,
              async, dims, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}