/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <assert.h>
#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static char errmsg[128];

static char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    {
      if (cuda_errlist[i].r == r)
        return &cuda_errlist[i].m[0];
    }

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
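/* The page allocated by map_init below (H_BEGIN..H_END in struct ptx_stream)
   is used as a simple ring buffer of struct map entries: H_NEXT is where the
   next chunk is carved out, H_PREV is the most recently pushed chunk, and
   H_TAIL is the oldest chunk still live.  For illustration (sizes
   hypothetical), after two 32-byte pushes on a fresh stream:

     h_begin/h_tail            h_prev      h_next             h_end
     |                         |           |                  |
     [map #1 (32+hdr bytes)]   [map #2]    [free............. ]

   map_push carves from H_NEXT, wrapping back to H_BEGIN when it reaches
   H_END; map_pop retires entries from H_TAIL.  */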
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
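/* A sketch of the intended pairing, mirroring what nvptx_exec does below:
   each kernel launch pushes one argument block, and the matching map_pop
   happens either synchronously after the launch, or later from event_gc once
   the kernel's recorded event has completed:

     void *hp, *dp;
     map_push (stream, async, mapnum * sizeof (void *), &hp, &dp);
     ... fill the block via HP, launch the kernel reading from DP ...
     map_pop (stream);  // or deferred via a PTX_EVT_KNL event

   Pushes and pops must stay in FIFO order, since the backing store is the
   ring buffer described above.  */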
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};
struct ptx_image_data
{
  void *target_data;
  CUmodule module;
  struct ptx_image_data *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;

static struct ptx_image_data *ptx_images = NULL;
static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
#define _XSTR(s) _STR(s)
#define _STR(s) #s

static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuInit) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};
static int
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return -1;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
        {
          /* Close the library before bailing out, so repeated probing does
             not leak dlopen handles.  */
          dlclose (dh);
          return -1;
        }
    }

  dlclose (dh);

  return 0;
}
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns stream to use
   as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
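/* For example (using the values from libgomp's openacc.h, where
   acc_async_noval is -1 and acc_async_sync is -2): a call with ASYNC ==
   acc_async_noval is renumbered to slot 0 by the ASYNC++ above, and a user's
   async (n) with n >= 0 lands in slot n + 1, so the implicit stream and the
   numbered streams never collide in ASYNC_STREAMS.ARR.  */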
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int rc;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  rc = verify_device_library ();
  if (rc < 0)
    return false;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
                break;
              }
            }

          cuEventDestroy (*te);
          free ((void *)te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
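/* Taken together, event_add and event_gc implement a deferred-cleanup scheme:
   every asynchronous operation records a CUevent on its stream and queues a
   ptx_event describing what to release.  Later calls poll the list with
   cuEventQuery and, for each completed event, pop the kernel's argument block
   (PTX_EVT_KNL), unmap the associated variables (PTX_EVT_ASYNC_CLEANUP), or
   just free the event (PTX_EVT_MEM, PTX_EVT_SYNC).  Nothing ever blocks:
   still-pending events are simply left for a later pass.  */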
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int num_gangs,
            int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
                     targ_fn->name);

  // OpenACC          CUDA
  //
  // num_gangs        blocks
  // num_workers      warps (where a warp is equivalent to 32 threads)
  // vector length    threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However,
     the CUDA interface abstracts away from that, and only shows us warp size
     indirectly in the maximum number of threads per block, which is a product
     of warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
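  /* For instance (an illustrative sketch, not plugin code): a region compiled
     with vector_length(128) is launched below with blockDim.x == 128, i.e.
     four 32-thread warps per block, whereas requesting, say,
     vector_length(2048) on a device whose limit is 1024 threads per block
     makes cuLaunchKernel fail as described above.  */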
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,
                      nthreads_in_block, 1, 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void *openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the
         call to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}
static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, and add it to our list, removing the previous entry also (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: maybe we should keep track of passed-in streams and
     leave it up to the user to tidy those up, but that doesn't work for
     stream handles returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
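/* For illustration, the user-visible flow that nvptx_get_cuda_stream and
   nvptx_set_cuda_stream back (via the acc_get_cuda_stream and
   acc_set_cuda_stream entry points declared in openacc.h) looks roughly like
   this; the calls shown are an assumed usage sketch, not code from this
   plugin:

     CUstream s;
     cuStreamCreate (&s, CU_STREAM_DEFAULT);  // user-created CUDA stream
     acc_set_cuda_stream (5, s);              // async(5) now targets S
     ...
     CUstream t = (CUstream) acc_get_cuda_stream (5);  // t == s

   after which OpenACC async(5) operations and native CUDA work submitted to
   S share a single queue.  */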
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
int
GOMP_OFFLOAD_load_image (int ord, void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  char **fn_names, **var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  void **img_header = (void **) target_data;
  struct ptx_image_data *new_image;

  GOMP_OFFLOAD_init_device (ord);

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header[0]);

  pthread_mutex_lock (&ptx_image_lock);
  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->next = ptx_images;
  ptx_images = new_image;
  pthread_mutex_unlock (&ptx_image_lock);

  /* The mkoffload utility emits a table of pointers/integers at the start of
     each offload image:

     img_header[0] -> ptx code
     img_header[1] -> number of variables
     img_header[2] -> array of variable names (pointers to strings)
     img_header[3] -> number of kernels
     img_header[4] -> array of kernel names (pointers to strings)

     The array of kernel names and the functions addresses form a
     one-to-one correspondence.  */
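  /* Viewed as a C declaration, the header is effectively (an illustrative
     sketch only; no such type is declared elsewhere):

       struct nvptx_img_header
       {
         const char *ptx_code;
         uintptr_t var_entries;
         char **var_names;
         uintptr_t fn_entries;
         char **fn_names;
       };

     and the loads below simply index into that layout.  */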
  var_entries = (uintptr_t) img_header[1];
  var_names = (char **) img_header[2];
  fn_entries = (uintptr_t) img_header[3];
  fn_names = (char **) img_header[4];

  *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                      * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  for (i = 0; i < fn_entries; i++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns[i].fn = function;
      targ_fns[i].name = (const char *) fn_names[i];

      (*target_table)[i].start = (uintptr_t) &targ_fns[i];
      (*target_table)[i].end = (*target_table)[i].start + 1;
    }

  for (j = 0; j < var_entries; j++, i++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      (*target_table)[i].start = (uintptr_t) var;
      (*target_table)[i].end = (*target_table)[i].start + bytes;
    }

  return i;
}
void
GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
{
  void **img_header = (void **) target_data;
  struct targ_fn_descriptor *targ_fns
    = (struct targ_fn_descriptor *) img_header[0];
  struct ptx_image_data *image, *prev = NULL, *newhd = NULL;

  free (targ_fns);

  pthread_mutex_lock (&ptx_image_lock);
  for (image = ptx_images; image != NULL;)
    {
      struct ptx_image_data *next = image->next;

      if (image->target_data == target_data)
        {
          cuModuleUnload (image->module);
          free (image);
          if (prev)
            prev->next = next;
        }
      else
        {
          prev = image;
          if (!newhd)
            newhd = image;
        }

      image = next;
    }
  ptx_images = newhd;
  pthread_mutex_unlock (&ptx_image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}

void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs, size_t *sizes,
                               unsigned short *kinds, int num_gangs,
                               int num_workers, int vector_length, int async,
                               void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
              num_workers, vector_length, async, targ_mem_desc);
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}