/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <assert.h>

#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))

static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};

static char errmsg[128];

static char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    {
      if (cuda_errlist[i].r == r)
        return &cuda_errlist[i].m[0];
    }

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};

struct map
{
  int async;
  size_t size;
  char mappings[0];
};

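/* Each ptx_stream above owns one page of page-locked host memory (H), which
   is also mapped into the device address space (D).  map_push and map_pop
   below treat that page as a ring buffer of variable-sized chunks: each
   chunk is a struct map header followed by SIZE bytes of payload (the
   MAPPINGS flexible array member).  H_NEXT is where the next chunk will be
   carved, H_PREV is the most recently pushed chunk, and H_TAIL is the oldest
   chunk still in flight; chunks are retired in FIFO order as the associated
   kernel launches complete.  */
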
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}

static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}

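/* Retire the oldest outstanding chunk from stream S's argument ring buffer,
   advancing H_TAIL past the chunk's header and payload, and wrapping around
   at the end of the page.  */
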
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}

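/* Carve a chunk with SIZE bytes of payload (plus a struct map header
   recording ASYNC) out of stream S's argument ring buffer, wrapping to the
   start of the page when the remaining tail space is too small.  On return,
   *H and *D point to the payload through the host and device mappings
   respectively.  Aborts if the page cannot accommodate the request.  */
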
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}

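/* The push/pop pair is used as in nvptx_exec below: reserve a chunk large
   enough for a kernel's pointer arguments, fill it through the host mapping,
   hand the device mapping to the launch, and pop once the kernel has
   completed:

     void *hp, *dp;
     map_push (stream, async, mapnum * sizeof (void *), &hp, &dp);
     ... store mapnum device pointers at ((void **) hp)[0 .. mapnum-1] ...
     ... launch the kernel with DP as its argument block ...
     map_pop (stream);  */
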
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_device *next;
};

enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

struct ptx_image_data
{
  void *target_data;
  CUmodule module;
  struct ptx_image_data *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static struct ptx_image_data *ptx_images = NULL;
static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;

#define _XSTR(s) _STR(s)
#define _STR(s) #s

static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuInit) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};

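/* Check that the CUDA driver library can be loaded and that it provides
   every entry point listed in cuda_symnames above.  Returns 0 on success,
   -1 if the library or any symbol is missing.  */
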
static int
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return -1;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
        return -1;
    }

  dlclose (dh);

  return 0;
}

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

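/* Create the null stream for PTX_DEV and size its table of async streams to
   CONCURRENCY entries.  The async streams themselves are created lazily, by
   select_stream_for_async.  */
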
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}

static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}

/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int rc;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  rc = verify_device_library ();
  if (rc < 0)
    return false;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}

static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}

static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    cuInit (0);

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}

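/* JIT-link PTX_CODE, together with the PTX fragments libgomp provides for
   abort, acc_on_device and its internal GOACC helpers, into a CUDA module
   returned in *MODULE.  The JIT info and error logs are captured into
   fixed-size buffers; the error log is reported if any link step fails.  */
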
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}

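/* Sweep the global list of pending events, destroying and unlinking those
   belonging to the current device whose CUDA event has completed.  Kernel
   events pop their chunk of the stream's argument ring buffer; async-cleanup
   events unmap their variables, which requires the memory-map lock, so those
   are skipped (to be retried on a later sweep) when MEMMAP_LOCKABLE is
   false.  */
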
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}

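/* Record a pending event E of the given TYPE for the current device on the
   global event list.  H is the address acted upon at collection time: the
   stream whose map entry to pop for PTX_EVT_KNL, the target memory
   descriptor to unmap for PTX_EVT_ASYNC_CLEANUP, and unused otherwise.  */
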
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}

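/* Launch the kernel described by the targ_fn_descriptor FN on the stream
   selected by ASYNC: the MAPNUM device pointers in DEVADDRS are staged
   through the stream's mapped page and handed to the kernel as its single
   argument, and the launch uses NUM_GANGS blocks of VECTOR_LENGTH threads
   each.  Synchronous launches are waited for here; asynchronous ones get a
   recorded event so event_gc can retire them later.  */
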
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
            int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  //  OpenACC        CUDA
  //
  //  num_gangs      blocks
  //  num_workers    warps (where a warp is equivalent to 32 threads)
  //  vector length  threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However,
     the CUDA interface abstracts away from that, and only shows us warp size
     indirectly, as the maximum number of threads per block, which is a product
     of warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses threadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a CUDA error.  */

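  /* For instance, under this mapping a region compiled with num_gangs=32 and
     vector_length=128 (illustrative values) reaches the launch below as

       cuLaunchKernel (function, 32, 1, 1, 128, 1, 1, 0, stream, kargs, 0);

     with the grid x-dimension carrying num_gangs and the block x-dimension
     carrying vector_length.  Note that num_workers does not enter into the
     launch geometry at this point.  */
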
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,
                      nthreads_in_block, 1, 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}

void * openacc_get_current_cuda_context (void);

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}

static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

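/* Copy S bytes from host address H to device address D, validating both
   addresses and checking that the copy stays within the device allocation.
   On a non-null current stream the copy is issued with cuMemcpyHtoDAsync and
   tracked with a recorded event; on the null stream it is a synchronous
   cuMemcpyHtoD.  nvptx_dev2host below is the mirror image, using
   cuMemcpyDtoH(Async).  */
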
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}

static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}

static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}

static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}

static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}

static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}

static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}

static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}

static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}

static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */
  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}

int
GOMP_OFFLOAD_load_image (int ord, void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  char **fn_names, **var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  void **img_header = (void **) target_data;
  struct ptx_image_data *new_image;

  GOMP_OFFLOAD_init_device (ord);

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header[0]);

  pthread_mutex_lock (&ptx_image_lock);
  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->next = ptx_images;
  ptx_images = new_image;
  pthread_mutex_unlock (&ptx_image_lock);

  /* The mkoffload utility emits a table of pointers/integers at the start of
     each offload image:

     img_header[0] -> ptx code
     img_header[1] -> number of variables
     img_header[2] -> array of variable names (pointers to strings)
     img_header[3] -> number of kernels
     img_header[4] -> array of kernel names (pointers to strings)

     The array of kernel names and the functions addresses form a
     one-to-one correspondence.  */

  var_entries = (uintptr_t) img_header[1];
  var_names = (char **) img_header[2];
  fn_entries = (uintptr_t) img_header[3];
  fn_names = (char **) img_header[4];

  *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                      * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  for (i = 0; i < fn_entries; i++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns[i].fn = function;
      targ_fns[i].name = (const char *) fn_names[i];

      (*target_table)[i].start = (uintptr_t) &targ_fns[i];
      (*target_table)[i].end = (*target_table)[i].start + 1;
    }

  for (j = 0; j < var_entries; j++, i++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      (*target_table)[i].start = (uintptr_t) var;
      (*target_table)[i].end = (*target_table)[i].start + bytes;
    }

  return i;
}

void
GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
{
  void **img_header = (void **) target_data;
  struct targ_fn_descriptor *targ_fns
    = (struct targ_fn_descriptor *) img_header[0];
  struct ptx_image_data *image, *prev = NULL, *newhd = NULL;

  free (targ_fns);

  pthread_mutex_lock (&ptx_image_lock);
  for (image = ptx_images; image != NULL;)
    {
      struct ptx_image_data *next = image->next;

      if (image->target_data == target_data)
        {
          cuModuleUnload (image->module);
          free (image);
          if (prev)
            prev->next = next;
        }
      else
        {
          prev = image;
          if (!newhd)
            newhd = image;
        }

      image = next;
    }
  ptx_images = newhd;
  pthread_mutex_unlock (&ptx_image_lock);
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;

void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs, size_t *sizes,
                               unsigned short *kinds, int num_gangs,
                               int num_workers, int vector_length, int async,
                               void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
              num_workers, vector_length, async, targ_mem_desc);
}

void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

1865 void *
1866 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1868 return nvptx_get_current_cuda_device ();
1871 void *
1872 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1874 return nvptx_get_current_cuda_context ();
1877 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
1879 void *
1880 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1882 return nvptx_get_cuda_stream (async);
1885 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
1888 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1890 return nvptx_set_cuda_stream (async, stream);