1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
50 cuda_error (CUresult r
)
52 #if CUDA_VERSION < 7000
53 /* Specified in documentation and present in library from at least
54 5.5. Not declared in header file prior to 7.0. */
55 extern CUresult
cuGetErrorString (CUresult
, const char **);
59 r
= cuGetErrorString (r
, &desc
);
60 if (r
!= CUDA_SUCCESS
)
61 desc
= "unknown cuda error";
66 /* Convenience macros for the frequently used CUDA library call and
67 error handling sequence. This does not capture all the cases we
68 use in this file, but is common enough. */
70 #define CUDA_CALL_ERET(ERET, FN, ...) \
72 unsigned __r = FN (__VA_ARGS__); \
73 if (__r != CUDA_SUCCESS) \
75 GOMP_PLUGIN_error (#FN " error: %s", \
81 #define CUDA_CALL(FN, ...) \
82 CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
84 #define CUDA_CALL_ASSERT(FN, ...) \
86 unsigned __r = FN (__VA_ARGS__); \
87 if (__r != CUDA_SUCCESS) \
89 GOMP_PLUGIN_fatal (#FN " error: %s", \
94 static unsigned int instantiated_devices
= 0;
95 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
100 pthread_t host_thread
;
111 struct ptx_stream
*next
;
114 /* Thread-specific data for PTX. */
118 struct ptx_stream
*current_stream
;
119 struct ptx_device
*ptx_dev
;
130 map_init (struct ptx_stream
*s
)
132 int size
= getpagesize ();
138 CUDA_CALL (cuMemAllocHost
, &s
->h
, size
);
139 CUDA_CALL (cuMemHostGetDevicePointer
, &s
->d
, s
->h
, 0);
144 s
->h_end
= s
->h_begin
+ size
;
145 s
->h_next
= s
->h_prev
= s
->h_tail
= s
->h_begin
;
153 map_fini (struct ptx_stream
*s
)
155 CUDA_CALL (cuMemFreeHost
, s
->h
);
160 map_pop (struct ptx_stream
*s
)
171 s
->h_tail
+= m
->size
;
173 if (s
->h_tail
>= s
->h_end
)
174 s
->h_tail
= s
->h_begin
+ (int) (s
->h_tail
- s
->h_end
);
176 if (s
->h_next
== s
->h_tail
)
177 s
->h_prev
= s
->h_next
;
179 assert (s
->h_next
>= s
->h_begin
);
180 assert (s
->h_tail
>= s
->h_begin
);
181 assert (s
->h_prev
>= s
->h_begin
);
183 assert (s
->h_next
<= s
->h_end
);
184 assert (s
->h_tail
<= s
->h_end
);
185 assert (s
->h_prev
<= s
->h_end
);
189 map_push (struct ptx_stream
*s
, int async
, size_t size
, void **h
, void **d
)
197 left
= s
->h_end
- s
->h_next
;
198 size
+= sizeof (struct map
);
207 s
->h_next
= s
->h_begin
;
209 if (s
->h_next
+ size
> s
->h_end
)
210 GOMP_PLUGIN_fatal ("unable to push map");
219 offset
= (void *)&m
->mappings
[0] - s
->h
;
221 *d
= (void *)(s
->d
+ offset
);
222 *h
= (void *)(s
->h
+ offset
);
224 s
->h_prev
= s
->h_next
;
230 assert (s
->h_next
>= s
->h_begin
);
231 assert (s
->h_tail
>= s
->h_begin
);
232 assert (s
->h_prev
>= s
->h_begin
);
233 assert (s
->h_next
<= s
->h_end
);
234 assert (s
->h_tail
<= s
->h_end
);
235 assert (s
->h_prev
<= s
->h_end
);
240 /* Target data function launch information. */
242 struct targ_fn_launch
245 unsigned short dim
[GOMP_DIM_MAX
];
248 /* Target PTX object information. */
256 /* Target data image information. */
258 typedef struct nvptx_tdata
260 const struct targ_ptx_obj
*ptx_objs
;
263 const char *const *var_names
;
266 const struct targ_fn_launch
*fn_descs
;
270 /* Descriptor of a loaded function. */
272 struct targ_fn_descriptor
275 const struct targ_fn_launch
*launch
;
278 /* A loaded PTX image. */
279 struct ptx_image_data
281 const void *target_data
;
284 struct targ_fn_descriptor
*fns
; /* Array of functions. */
286 struct ptx_image_data
*next
;
294 struct ptx_stream
*null_stream
;
295 /* All non-null streams associated with this device (actually context),
296 either created implicitly or passed in from the user (via
297 acc_set_cuda_stream). */
298 struct ptx_stream
*active_streams
;
300 struct ptx_stream
**arr
;
303 /* A lock for use when manipulating the above stream list and array. */
304 pthread_mutex_t stream_lock
;
312 struct ptx_image_data
*images
; /* Images loaded on device. */
313 pthread_mutex_t image_lock
; /* Lock for above list. */
315 struct ptx_device
*next
;
323 PTX_EVT_ASYNC_CLEANUP
334 struct ptx_event
*next
;
337 static pthread_mutex_t ptx_event_lock
;
338 static struct ptx_event
*ptx_events
;
340 static struct ptx_device
**ptx_devices
;
342 static inline struct nvptx_thread
*
345 return (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
349 init_streams_for_device (struct ptx_device
*ptx_dev
, int concurrency
)
352 struct ptx_stream
*null_stream
353 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
355 null_stream
->stream
= NULL
;
356 null_stream
->host_thread
= pthread_self ();
357 null_stream
->multithreaded
= true;
358 null_stream
->d
= (CUdeviceptr
) NULL
;
359 null_stream
->h
= NULL
;
360 if (!map_init (null_stream
))
363 ptx_dev
->null_stream
= null_stream
;
364 ptx_dev
->active_streams
= NULL
;
365 pthread_mutex_init (&ptx_dev
->stream_lock
, NULL
);
370 /* This is just a guess -- make space for as many async streams as the
371 current device is capable of concurrently executing. This can grow
372 later as necessary. No streams are created yet. */
373 ptx_dev
->async_streams
.arr
374 = GOMP_PLUGIN_malloc (concurrency
* sizeof (struct ptx_stream
*));
375 ptx_dev
->async_streams
.size
= concurrency
;
377 for (i
= 0; i
< concurrency
; i
++)
378 ptx_dev
->async_streams
.arr
[i
] = NULL
;
384 fini_streams_for_device (struct ptx_device
*ptx_dev
)
386 free (ptx_dev
->async_streams
.arr
);
389 while (ptx_dev
->active_streams
!= NULL
)
391 struct ptx_stream
*s
= ptx_dev
->active_streams
;
392 ptx_dev
->active_streams
= ptx_dev
->active_streams
->next
;
396 CUresult r
= cuStreamDestroy (s
->stream
);
397 if (r
!= CUDA_SUCCESS
)
399 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r
));
405 ret
&= map_fini (ptx_dev
->null_stream
);
406 free (ptx_dev
->null_stream
);
410 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
411 thread THREAD (and also current device/context). If CREATE is true, create
412 the stream if it does not exist (or use EXISTING if it is non-NULL), and
413 associate the stream with the same thread argument. Returns stream to use
416 static struct ptx_stream
*
417 select_stream_for_async (int async
, pthread_t thread
, bool create
,
420 struct nvptx_thread
*nvthd
= nvptx_thread ();
421 /* Local copy of TLS variable. */
422 struct ptx_device
*ptx_dev
= nvthd
->ptx_dev
;
423 struct ptx_stream
*stream
= NULL
;
424 int orig_async
= async
;
426 /* The special value acc_async_noval (-1) maps (for now) to an
427 implicitly-created stream, which is then handled the same as any other
428 numbered async stream. Other options are available, e.g. using the null
429 stream for anonymous async operations, or choosing an idle stream from an
430 active set. But, stick with this for now. */
431 if (async
> acc_async_sync
)
435 pthread_mutex_lock (&ptx_dev
->stream_lock
);
437 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
438 null stream, and in fact better performance may be obtainable if it doesn't
439 (because the null stream enforces overly-strict synchronisation with
440 respect to other streams for legacy reasons, and that's probably not
441 needed with OpenACC). Maybe investigate later. */
442 if (async
== acc_async_sync
)
443 stream
= ptx_dev
->null_stream
;
444 else if (async
>= 0 && async
< ptx_dev
->async_streams
.size
445 && ptx_dev
->async_streams
.arr
[async
] && !(create
&& existing
))
446 stream
= ptx_dev
->async_streams
.arr
[async
];
447 else if (async
>= 0 && create
)
449 if (async
>= ptx_dev
->async_streams
.size
)
451 int i
, newsize
= ptx_dev
->async_streams
.size
* 2;
453 if (async
>= newsize
)
456 ptx_dev
->async_streams
.arr
457 = GOMP_PLUGIN_realloc (ptx_dev
->async_streams
.arr
,
458 newsize
* sizeof (struct ptx_stream
*));
460 for (i
= ptx_dev
->async_streams
.size
; i
< newsize
; i
++)
461 ptx_dev
->async_streams
.arr
[i
] = NULL
;
463 ptx_dev
->async_streams
.size
= newsize
;
466 /* Create a new stream on-demand if there isn't one already, or if we're
467 setting a particular async value to an existing (externally-provided)
469 if (!ptx_dev
->async_streams
.arr
[async
] || existing
)
473 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream
));
476 s
->stream
= existing
;
479 r
= cuStreamCreate (&s
->stream
, CU_STREAM_DEFAULT
);
480 if (r
!= CUDA_SUCCESS
)
482 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
483 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
488 /* If CREATE is true, we're going to be queueing some work on this
489 stream. Associate it with the current host thread. */
490 s
->host_thread
= thread
;
491 s
->multithreaded
= false;
493 s
->d
= (CUdeviceptr
) NULL
;
497 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
498 GOMP_PLUGIN_fatal ("map_init fail");
501 s
->next
= ptx_dev
->active_streams
;
502 ptx_dev
->active_streams
= s
;
503 ptx_dev
->async_streams
.arr
[async
] = s
;
506 stream
= ptx_dev
->async_streams
.arr
[async
];
511 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
512 GOMP_PLUGIN_fatal ("bad async %d", async
);
517 assert (stream
!= NULL
);
519 /* If we're trying to use the same stream from different threads
520 simultaneously, set stream->multithreaded to true. This affects the
521 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
522 only wait for asynchronous launches from the same host thread they are
523 invoked on. If multiple threads use the same async value, we make note
524 of that here and fall back to testing/waiting for all threads in those
526 if (thread
!= stream
->host_thread
)
527 stream
->multithreaded
= true;
529 pthread_mutex_unlock (&ptx_dev
->stream_lock
);
531 else if (stream
&& !stream
->multithreaded
532 && !pthread_equal (stream
->host_thread
, thread
))
533 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async
);
538 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
539 should be locked on entry and remains locked on exit. */
546 if (instantiated_devices
!= 0)
549 CUDA_CALL (cuInit
, 0);
551 pthread_mutex_init (&ptx_event_lock
, NULL
);
553 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
554 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
559 /* Select the N'th PTX device for the current host thread. The device must
560 have been previously opened before calling this function. */
563 nvptx_attach_host_thread_to_device (int n
)
567 struct ptx_device
*ptx_dev
;
570 r
= cuCtxGetDevice (&dev
);
571 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
573 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
577 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
583 ptx_dev
= ptx_devices
[n
];
586 GOMP_PLUGIN_error ("device %d not found", n
);
590 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
592 /* We don't necessarily have a current context (e.g. if it has been
593 destroyed. Pop it if we do though. */
595 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
597 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
602 static struct ptx_device
*
603 nvptx_open_device (int n
)
605 struct ptx_device
*ptx_dev
;
606 CUdevice dev
, ctx_dev
;
608 int async_engines
, pi
;
610 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
612 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
616 ptx_dev
->ctx_shared
= false;
618 r
= cuCtxGetDevice (&ctx_dev
);
619 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
621 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
625 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
627 /* The current host thread has an active context for a different device.
630 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
633 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
636 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
638 ptx_dev
->ctx_shared
= true;
640 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
641 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
642 ptx_dev
->overlap
= pi
;
644 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
645 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
648 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
649 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
650 ptx_dev
->concur
= pi
;
652 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
653 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
656 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
657 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
660 r
= cuDeviceGetAttribute (&async_engines
,
661 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
662 if (r
!= CUDA_SUCCESS
)
665 ptx_dev
->images
= NULL
;
666 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
668 if (!init_streams_for_device (ptx_dev
, async_engines
))
675 nvptx_close_device (struct ptx_device
*ptx_dev
)
680 if (!fini_streams_for_device (ptx_dev
))
683 pthread_mutex_destroy (&ptx_dev
->image_lock
);
685 if (!ptx_dev
->ctx_shared
)
686 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
693 nvptx_get_num_devices (void)
697 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
699 if (sizeof (void *) != 8)
702 /* This function will be called before the plugin has been initialized in
703 order to enumerate available devices, but CUDA API routines can't be used
704 until cuInit has been called. Just call it now (but don't yet do any
705 further initialization). */
706 if (instantiated_devices
== 0)
708 CUresult r
= cuInit (0);
709 /* This is not an error: e.g. we may have CUDA libraries installed but
710 no devices available. */
711 if (r
!= CUDA_SUCCESS
)
715 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
721 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
724 CUjit_option opts
[6];
730 unsigned long logsize
= LOGSIZE
;
731 CUlinkState linkstate
;
734 size_t linkoutsize
__attribute__ ((unused
));
736 opts
[0] = CU_JIT_WALL_TIME
;
737 optvals
[0] = &elapsed
;
739 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
740 optvals
[1] = &ilog
[0];
742 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
743 optvals
[2] = (void *) logsize
;
745 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
746 optvals
[3] = &elog
[0];
748 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
749 optvals
[4] = (void *) logsize
;
751 opts
[5] = CU_JIT_LOG_VERBOSE
;
752 optvals
[5] = (void *) 1;
754 CUDA_CALL (cuLinkCreate
, 6, opts
, optvals
, &linkstate
);
756 for (; num_objs
--; ptx_objs
++)
758 /* cuLinkAddData's 'data' argument erroneously omits the const
760 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
761 r
= cuLinkAddData (linkstate
, CU_JIT_INPUT_PTX
, (char*)ptx_objs
->code
,
762 ptx_objs
->size
, 0, 0, 0, 0);
763 if (r
!= CUDA_SUCCESS
)
765 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
766 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
772 GOMP_PLUGIN_debug (0, "Linking\n");
773 r
= cuLinkComplete (linkstate
, &linkout
, &linkoutsize
);
775 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
776 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
778 if (r
!= CUDA_SUCCESS
)
780 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
784 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
785 CUDA_CALL (cuLinkDestroy
, linkstate
);
790 event_gc (bool memmap_lockable
)
792 struct ptx_event
*ptx_event
= ptx_events
;
793 struct ptx_event
*async_cleanups
= NULL
;
794 struct nvptx_thread
*nvthd
= nvptx_thread ();
796 pthread_mutex_lock (&ptx_event_lock
);
798 while (ptx_event
!= NULL
)
801 struct ptx_event
*e
= ptx_event
;
803 ptx_event
= ptx_event
->next
;
805 if (e
->ord
!= nvthd
->ptx_dev
->ord
)
808 r
= cuEventQuery (*e
->evt
);
809 if (r
== CUDA_SUCCESS
)
811 bool append_async
= false;
826 case PTX_EVT_ASYNC_CLEANUP
:
828 /* The function gomp_plugin_async_unmap_vars needs to claim the
829 memory-map splay tree lock for the current device, so we
830 can't call it when one of our callers has already claimed
831 the lock. In that case, just delay the GC for this event
833 if (!memmap_lockable
)
841 cuEventDestroy (*te
);
844 /* Unlink 'e' from ptx_events list. */
846 ptx_events
= ptx_events
->next
;
849 struct ptx_event
*e_
= ptx_events
;
850 while (e_
->next
!= e
)
852 e_
->next
= e_
->next
->next
;
857 e
->next
= async_cleanups
;
865 pthread_mutex_unlock (&ptx_event_lock
);
867 /* We have to do these here, after ptx_event_lock is released. */
868 while (async_cleanups
)
870 struct ptx_event
*e
= async_cleanups
;
871 async_cleanups
= async_cleanups
->next
;
873 GOMP_PLUGIN_async_unmap_vars (e
->addr
, e
->val
);
879 event_add (enum ptx_event_type type
, CUevent
*e
, void *h
, int val
)
881 struct ptx_event
*ptx_event
;
882 struct nvptx_thread
*nvthd
= nvptx_thread ();
884 assert (type
== PTX_EVT_MEM
|| type
== PTX_EVT_KNL
|| type
== PTX_EVT_SYNC
885 || type
== PTX_EVT_ASYNC_CLEANUP
);
887 ptx_event
= GOMP_PLUGIN_malloc (sizeof (struct ptx_event
));
888 ptx_event
->type
= type
;
891 ptx_event
->ord
= nvthd
->ptx_dev
->ord
;
892 ptx_event
->val
= val
;
894 pthread_mutex_lock (&ptx_event_lock
);
896 ptx_event
->next
= ptx_events
;
897 ptx_events
= ptx_event
;
899 pthread_mutex_unlock (&ptx_event_lock
);
903 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
904 int async
, unsigned *dims
, void *targ_mem_desc
)
906 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
910 struct ptx_stream
*dev_str
;
913 struct nvptx_thread
*nvthd
= nvptx_thread ();
914 const char *maybe_abort_msg
= "(perhaps abort was called)";
916 function
= targ_fn
->fn
;
918 dev_str
= select_stream_for_async (async
, pthread_self (), false, NULL
);
919 assert (dev_str
== nvthd
->current_stream
);
921 /* Initialize the launch dimensions. Typically this is constant,
922 provided by the device compiler, but we must permit runtime
925 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
927 if (targ_fn
->launch
->dim
[i
])
928 dims
[i
] = targ_fn
->launch
->dim
[i
];
935 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
937 dims
[i
] = /* TODO */ 32;
940 /* This reserves a chunk of a pre-allocated page of memory mapped on both
941 the host and the device. HP is a host pointer to the new chunk, and DP is
942 the corresponding device pointer. */
943 map_push (dev_str
, async
, mapnum
* sizeof (void *), &hp
, &dp
);
945 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
947 /* Copy the array of arguments to the mapped page. */
948 for (i
= 0; i
< mapnum
; i
++)
949 ((void **) hp
)[i
] = devaddrs
[i
];
951 /* Copy the (device) pointers to arguments to the device (dp and hp might in
952 fact have the same value on a unified-memory system). */
953 CUDA_CALL_ASSERT (cuMemcpy
, (CUdeviceptr
) dp
, (CUdeviceptr
) hp
,
954 mapnum
* sizeof (void *));
955 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
956 " gangs=%u, workers=%u, vectors=%u\n",
957 __FUNCTION__
, targ_fn
->launch
->fn
,
958 dims
[0], dims
[1], dims
[2]);
962 // num_gangs nctaid.x
963 // num_workers ntid.y
964 // vector length ntid.x
967 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
968 dims
[GOMP_DIM_GANG
], 1, 1,
969 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
970 0, dev_str
->stream
, kargs
, 0);
972 #ifndef DISABLE_ASYNC
973 if (async
< acc_async_noval
)
975 r
= cuStreamSynchronize (dev_str
->stream
);
976 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
977 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
979 else if (r
!= CUDA_SUCCESS
)
980 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
986 e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
988 r
= cuEventCreate (e
, CU_EVENT_DISABLE_TIMING
);
989 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
990 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r
),
992 else if (r
!= CUDA_SUCCESS
)
993 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r
));
997 CUDA_CALL_ASSERT (cuEventRecord
, *e
, dev_str
->stream
);
999 event_add (PTX_EVT_KNL
, e
, (void *)dev_str
, 0);
1002 r
= cuCtxSynchronize ();
1003 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1004 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1006 else if (r
!= CUDA_SUCCESS
)
1007 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1010 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
1011 targ_fn
->launch
->fn
);
1013 #ifndef DISABLE_ASYNC
1014 if (async
< acc_async_noval
)
1019 void * openacc_get_current_cuda_context (void);
1022 nvptx_alloc (size_t s
)
1026 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1031 nvptx_free (void *p
)
1036 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1037 if ((CUdeviceptr
) p
!= pb
)
1039 GOMP_PLUGIN_error ("invalid device address");
1043 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1049 nvptx_host2dev (void *d
, const void *h
, size_t s
)
1053 struct nvptx_thread
*nvthd
= nvptx_thread ();
1059 GOMP_PLUGIN_error ("invalid device address");
1063 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1067 GOMP_PLUGIN_error ("invalid device address");
1072 GOMP_PLUGIN_error ("invalid host address");
1077 GOMP_PLUGIN_error ("invalid host or device address");
1080 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1082 GOMP_PLUGIN_error ("invalid size");
1086 #ifndef DISABLE_ASYNC
1087 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1089 CUevent
*e
= (CUevent
*)GOMP_PLUGIN_malloc (sizeof (CUevent
));
1090 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1092 CUDA_CALL (cuMemcpyHtoDAsync
,
1093 (CUdeviceptr
) d
, h
, s
, nvthd
->current_stream
->stream
);
1094 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1095 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1099 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) d
, h
, s
);
1105 nvptx_dev2host (void *h
, const void *d
, size_t s
)
1109 struct nvptx_thread
*nvthd
= nvptx_thread ();
1115 GOMP_PLUGIN_error ("invalid device address");
1119 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1123 GOMP_PLUGIN_error ("invalid device address");
1128 GOMP_PLUGIN_error ("invalid host address");
1133 GOMP_PLUGIN_error ("invalid host or device address");
1136 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1138 GOMP_PLUGIN_error ("invalid size");
1142 #ifndef DISABLE_ASYNC
1143 if (nvthd
->current_stream
!= nvthd
->ptx_dev
->null_stream
)
1145 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1146 CUDA_CALL (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1148 CUDA_CALL (cuMemcpyDtoHAsync
,
1149 h
, (CUdeviceptr
) d
, s
, nvthd
->current_stream
->stream
);
1150 CUDA_CALL (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1151 event_add (PTX_EVT_MEM
, e
, (void *)h
, 0);
1155 CUDA_CALL (cuMemcpyDtoH
, h
, (CUdeviceptr
) d
, s
);
1161 nvptx_set_async (int async
)
1163 struct nvptx_thread
*nvthd
= nvptx_thread ();
1164 nvthd
->current_stream
1165 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1169 nvptx_async_test (int async
)
1172 struct ptx_stream
*s
;
1174 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1177 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1179 r
= cuStreamQuery (s
->stream
);
1180 if (r
== CUDA_SUCCESS
)
1182 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1183 whether all work has completed on this stream, and if so omits the call
1184 to the wait hook. If that happens, event_gc might not get called
1185 (which prevents variables from getting unmapped and their associated
1186 device storage freed), so call it here. */
1190 else if (r
== CUDA_ERROR_NOT_READY
)
1193 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1199 nvptx_async_test_all (void)
1201 struct ptx_stream
*s
;
1202 pthread_t self
= pthread_self ();
1203 struct nvptx_thread
*nvthd
= nvptx_thread ();
1205 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1207 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1209 if ((s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1210 && cuStreamQuery (s
->stream
) == CUDA_ERROR_NOT_READY
)
1212 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1217 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1225 nvptx_wait (int async
)
1227 struct ptx_stream
*s
;
1229 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1231 GOMP_PLUGIN_fatal ("unknown async %d", async
);
1233 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1239 nvptx_wait_async (int async1
, int async2
)
1242 struct ptx_stream
*s1
, *s2
;
1243 pthread_t self
= pthread_self ();
1245 /* The stream that is waiting (rather than being waited for) doesn't
1246 necessarily have to exist already. */
1247 s2
= select_stream_for_async (async2
, self
, true, NULL
);
1249 s1
= select_stream_for_async (async1
, self
, false, NULL
);
1251 GOMP_PLUGIN_fatal ("invalid async 1\n");
1254 GOMP_PLUGIN_fatal ("identical parameters");
1256 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1258 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1262 CUDA_CALL_ASSERT (cuEventRecord
, *e
, s1
->stream
);
1264 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1266 CUDA_CALL_ASSERT (cuStreamWaitEvent
, s2
->stream
, *e
, 0);
1270 nvptx_wait_all (void)
1273 struct ptx_stream
*s
;
1274 pthread_t self
= pthread_self ();
1275 struct nvptx_thread
*nvthd
= nvptx_thread ();
1277 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1279 /* Wait for active streams initiated by this thread (or by multiple threads)
1281 for (s
= nvthd
->ptx_dev
->active_streams
; s
!= NULL
; s
= s
->next
)
1283 if (s
->multithreaded
|| pthread_equal (s
->host_thread
, self
))
1285 r
= cuStreamQuery (s
->stream
);
1286 if (r
== CUDA_SUCCESS
)
1288 else if (r
!= CUDA_ERROR_NOT_READY
)
1289 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r
));
1291 CUDA_CALL_ASSERT (cuStreamSynchronize
, s
->stream
);
1295 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1301 nvptx_wait_all_async (int async
)
1303 struct ptx_stream
*waiting_stream
, *other_stream
;
1305 struct nvptx_thread
*nvthd
= nvptx_thread ();
1306 pthread_t self
= pthread_self ();
1308 /* The stream doing the waiting. This could be the first mention of the
1309 stream, so create it if necessary. */
1311 = select_stream_for_async (async
, pthread_self (), true, NULL
);
1313 /* Launches on the null stream already block on other streams in the
1315 if (!waiting_stream
|| waiting_stream
== nvthd
->ptx_dev
->null_stream
)
1320 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1322 for (other_stream
= nvthd
->ptx_dev
->active_streams
;
1323 other_stream
!= NULL
;
1324 other_stream
= other_stream
->next
)
1326 if (!other_stream
->multithreaded
1327 && !pthread_equal (other_stream
->host_thread
, self
))
1330 e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1332 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1334 /* Record an event on the waited-for stream. */
1335 CUDA_CALL_ASSERT (cuEventRecord
, *e
, other_stream
->stream
);
1337 event_add (PTX_EVT_SYNC
, e
, NULL
, 0);
1339 CUDA_CALL_ASSERT (cuStreamWaitEvent
, waiting_stream
->stream
, *e
, 0);
1342 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1346 nvptx_get_current_cuda_device (void)
1348 struct nvptx_thread
*nvthd
= nvptx_thread ();
1350 if (!nvthd
|| !nvthd
->ptx_dev
)
1353 return &nvthd
->ptx_dev
->dev
;
1357 nvptx_get_current_cuda_context (void)
1359 struct nvptx_thread
*nvthd
= nvptx_thread ();
1361 if (!nvthd
|| !nvthd
->ptx_dev
)
1364 return nvthd
->ptx_dev
->ctx
;
1368 nvptx_get_cuda_stream (int async
)
1370 struct ptx_stream
*s
;
1371 struct nvptx_thread
*nvthd
= nvptx_thread ();
1373 if (!nvthd
|| !nvthd
->ptx_dev
)
1376 s
= select_stream_for_async (async
, pthread_self (), false, NULL
);
1378 return s
? s
->stream
: NULL
;
1382 nvptx_set_cuda_stream (int async
, void *stream
)
1384 struct ptx_stream
*oldstream
;
1385 pthread_t self
= pthread_self ();
1386 struct nvptx_thread
*nvthd
= nvptx_thread ();
1389 GOMP_PLUGIN_fatal ("bad async %d", async
);
1391 pthread_mutex_lock (&nvthd
->ptx_dev
->stream_lock
);
1393 /* We have a list of active streams and an array mapping async values to
1394 entries of that list. We need to take "ownership" of the passed-in stream,
1395 and add it to our list, removing the previous entry also (if there was one)
1396 in order to prevent resource leaks. Note the potential for surprise
1397 here: maybe we should keep track of passed-in streams and leave it up to
1398 the user to tidy those up, but that doesn't work for stream handles
1399 returned from acc_get_cuda_stream above... */
1401 oldstream
= select_stream_for_async (async
, self
, false, NULL
);
1405 if (nvthd
->ptx_dev
->active_streams
== oldstream
)
1406 nvthd
->ptx_dev
->active_streams
= nvthd
->ptx_dev
->active_streams
->next
;
1409 struct ptx_stream
*s
= nvthd
->ptx_dev
->active_streams
;
1410 while (s
->next
!= oldstream
)
1412 s
->next
= s
->next
->next
;
1415 CUDA_CALL_ASSERT (cuStreamDestroy
, oldstream
->stream
);
1417 if (!map_fini (oldstream
))
1418 GOMP_PLUGIN_fatal ("error when freeing host memory");
1423 pthread_mutex_unlock (&nvthd
->ptx_dev
->stream_lock
);
1425 (void) select_stream_for_async (async
, self
, true, (CUstream
) stream
);
1430 /* Plugin entry points. */
1433 GOMP_OFFLOAD_get_name (void)
1439 GOMP_OFFLOAD_get_caps (void)
1441 return GOMP_OFFLOAD_CAP_OPENACC_200
;
1445 GOMP_OFFLOAD_get_type (void)
1447 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
1451 GOMP_OFFLOAD_get_num_devices (void)
1453 return nvptx_get_num_devices ();
1457 GOMP_OFFLOAD_init_device (int n
)
1459 struct ptx_device
*dev
;
1461 pthread_mutex_lock (&ptx_dev_lock
);
1463 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1465 pthread_mutex_unlock (&ptx_dev_lock
);
1469 dev
= nvptx_open_device (n
);
1472 ptx_devices
[n
] = dev
;
1473 instantiated_devices
++;
1476 pthread_mutex_unlock (&ptx_dev_lock
);
1482 GOMP_OFFLOAD_fini_device (int n
)
1484 pthread_mutex_lock (&ptx_dev_lock
);
1486 if (ptx_devices
[n
] != NULL
)
1488 if (!nvptx_attach_host_thread_to_device (n
)
1489 || !nvptx_close_device (ptx_devices
[n
]))
1491 pthread_mutex_unlock (&ptx_dev_lock
);
1494 ptx_devices
[n
] = NULL
;
1495 instantiated_devices
--;
1498 pthread_mutex_unlock (&ptx_dev_lock
);
1502 /* Return the libgomp version number we're compatible with. There is
1503 no requirement for cross-version compatibility. */
1506 GOMP_OFFLOAD_version (void)
1508 return GOMP_VERSION
;
1511 /* Load the (partial) program described by TARGET_DATA to device
1512 number ORD. Allocate and return TARGET_TABLE. */
1515 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1516 struct addr_pair
**target_table
)
1519 const char *const *var_names
;
1520 const struct targ_fn_launch
*fn_descs
;
1521 unsigned int fn_entries
, var_entries
, i
, j
;
1522 struct targ_fn_descriptor
*targ_fns
;
1523 struct addr_pair
*targ_tbl
;
1524 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1525 struct ptx_image_data
*new_image
;
1526 struct ptx_device
*dev
;
1528 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1530 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1531 " (expected %u, received %u)",
1532 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1536 if (!nvptx_attach_host_thread_to_device (ord
)
1537 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1540 dev
= ptx_devices
[ord
];
1542 /* The mkoffload utility emits a struct of pointers/integers at the
1543 start of each offload image. The array of kernel names and the
1544 functions addresses form a one-to-one correspondence. */
1546 var_entries
= img_header
->var_num
;
1547 var_names
= img_header
->var_names
;
1548 fn_entries
= img_header
->fn_num
;
1549 fn_descs
= img_header
->fn_descs
;
1551 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1552 * (fn_entries
+ var_entries
));
1553 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1556 *target_table
= targ_tbl
;
1558 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1559 new_image
->target_data
= target_data
;
1560 new_image
->module
= module
;
1561 new_image
->fns
= targ_fns
;
1563 pthread_mutex_lock (&dev
->image_lock
);
1564 new_image
->next
= dev
->images
;
1565 dev
->images
= new_image
;
1566 pthread_mutex_unlock (&dev
->image_lock
);
1568 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1570 CUfunction function
;
1572 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1575 targ_fns
->fn
= function
;
1576 targ_fns
->launch
= &fn_descs
[i
];
1578 targ_tbl
->start
= (uintptr_t) targ_fns
;
1579 targ_tbl
->end
= targ_tbl
->start
+ 1;
1582 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1587 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1588 &var
, &bytes
, module
, var_names
[j
]);
1590 targ_tbl
->start
= (uintptr_t) var
;
1591 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1594 return fn_entries
+ var_entries
;
1597 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1598 function descriptors allocated by G_O_load_image. */
1601 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1603 struct ptx_image_data
*image
, **prev_p
;
1604 struct ptx_device
*dev
= ptx_devices
[ord
];
1606 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1608 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1609 " (expected %u, received %u)",
1610 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1615 pthread_mutex_lock (&dev
->image_lock
);
1616 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1617 if (image
->target_data
== target_data
)
1619 *prev_p
= image
->next
;
1620 if (cuModuleUnload (image
->module
) != CUDA_SUCCESS
)
1626 pthread_mutex_unlock (&dev
->image_lock
);
/* Allocate SIZE bytes on device ORD.  Returns the device pointer, or
   NULL if the host thread could not be attached to the device.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;
  return nvptx_alloc (size);
}
/* Free device memory PTR on device ORD.  Returns false if attaching
   the host thread or the free itself fails.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr));
}
/* Copy N bytes from device address SRC on device ORD to host address
   DST.  Returns false on failure.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_dev2host (dst, src, n));
}
/* Copy N bytes from host address SRC to device address DST on device
   ORD.  Returns false on failure.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_host2dev (dst, src, n));
}
/* Hook for running an offloaded function; unused by this plugin
   (OpenACC launches go through nvptx_exec instead).  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
/* Launch an OpenACC offloaded region: delegate directly to nvptx_exec
   with the kernel FN, the MAPNUM host/device address pairs, the ASYNC
   queue, and the launch geometry DIMS.  */

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs,
			       int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
}
1670 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc
, int async
)
1672 struct nvptx_thread
*nvthd
= nvptx_thread ();
1673 CUevent
*e
= (CUevent
*) GOMP_PLUGIN_malloc (sizeof (CUevent
));
1675 CUDA_CALL_ASSERT (cuEventCreate
, e
, CU_EVENT_DISABLE_TIMING
);
1676 CUDA_CALL_ASSERT (cuEventRecord
, *e
, nvthd
->current_stream
->stream
);
1677 event_add (PTX_EVT_ASYNC_CLEANUP
, e
, targ_mem_desc
, async
);
/* Non-blocking test whether async queue ASYNC has completed.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}
/* Non-blocking test whether all async queues have completed.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}
/* Block until async queue ASYNC has completed.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}
/* Make async queue ASYNC2 wait for the completion of queue ASYNC1,
   without blocking the host.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}
/* Block until all async queues have completed.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}
/* Make async queue ASYNC wait for all other queues to complete,
   without blocking the host.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}
/* Set the calling thread's current async queue to ASYNC.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
1723 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1725 struct ptx_device
*ptx_dev
;
1726 struct nvptx_thread
*nvthd
1727 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1730 ptx_dev
= ptx_devices
[ord
];
1734 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1736 assert (ptx_dev
->ctx
);
1739 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1741 nvthd
->current_stream
= ptx_dev
->null_stream
;
1742 nvthd
->ptx_dev
= ptx_dev
;
1744 return (void *) nvthd
;
/* Release the per-thread state DATA allocated by
   G_O_openacc_create_thread_data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Return the current thread's CUDA device handle, as an opaque
   pointer for the OpenACC runtime API.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Return the current thread's CUDA context, as an opaque pointer for
   the OpenACC runtime API.  */

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}