/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <assert.h>
#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))

static struct
{
  CUresult r;
  char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
static char errmsg[128];
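/* Return a short human-readable description of the CUDA result code R,
   falling back to a formatted "unknown result code" message (kept in the
   static buffer above) for codes not found in cuda_errlist.  */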
static char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    {
      if (cuda_errlist[i].r == r)
	return &cuda_errlist[i].m[0];
    }

  sprintf (&errmsg[0], "unknown result code: %5d", r);

  return &errmsg[0];
}
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

static bool ptx_inited = false;
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
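/* Each stream owns one page of page-locked host memory, visible from both
   the host and the device, which the map_* functions below carve up into
   struct map records.  The h_begin/h_end/h_next/h_prev/h_tail pointers in
   struct ptx_stream treat that page as a circular buffer: map_push
   allocates a record at h_next, and map_pop retires the oldest record at
   h_tail.  */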
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
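/* Retire the oldest mapping record on stream S, advancing the tail pointer
   (with wrap-around) past it.  */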
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
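/* Allocate a new mapping record of SIZE bytes (plus the struct map header)
   on stream S for async operation ASYNC, wrapping to the start of the
   buffer if the space remaining at the end is too small.  On return, *H and
   *D point to the record's payload in host and device address space
   respectively.  */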
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;
#define _XSTR(s) _STR(s)
#define _STR(s) #s

static struct _synames
{
  char *n;
} cuda_symnames[] =
{
  { _XSTR (cuCtxCreate) },
  { _XSTR (cuCtxDestroy) },
  { _XSTR (cuCtxGetCurrent) },
  { _XSTR (cuCtxPushCurrent) },
  { _XSTR (cuCtxSynchronize) },
  { _XSTR (cuDeviceGet) },
  { _XSTR (cuDeviceGetAttribute) },
  { _XSTR (cuDeviceGetCount) },
  { _XSTR (cuEventCreate) },
  { _XSTR (cuEventDestroy) },
  { _XSTR (cuEventQuery) },
  { _XSTR (cuEventRecord) },
  { _XSTR (cuInit) },
  { _XSTR (cuLaunchKernel) },
  { _XSTR (cuLinkAddData) },
  { _XSTR (cuLinkComplete) },
  { _XSTR (cuLinkCreate) },
  { _XSTR (cuMemAlloc) },
  { _XSTR (cuMemAllocHost) },
  { _XSTR (cuMemcpy) },
  { _XSTR (cuMemcpyDtoH) },
  { _XSTR (cuMemcpyDtoHAsync) },
  { _XSTR (cuMemcpyHtoD) },
  { _XSTR (cuMemcpyHtoDAsync) },
  { _XSTR (cuMemFree) },
  { _XSTR (cuMemFreeHost) },
  { _XSTR (cuMemGetAddressRange) },
  { _XSTR (cuMemHostGetDevicePointer) },
  { _XSTR (cuMemHostRegister) },
  { _XSTR (cuMemHostUnregister) },
  { _XSTR (cuModuleGetFunction) },
  { _XSTR (cuModuleLoadData) },
  { _XSTR (cuStreamDestroy) },
  { _XSTR (cuStreamQuery) },
  { _XSTR (cuStreamSynchronize) },
  { _XSTR (cuStreamWaitEvent) }
};
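/* Check that the CUDA driver library can be dlopened, and that it provides
   every entry point named in cuda_symnames above.  Return 0 on success,
   -1 otherwise.  */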
static int
verify_device_library (void)
{
  int i;
  void *dh, *ds;

  dh = dlopen ("libcuda.so", RTLD_LAZY);
  if (!dh)
    return -1;

  for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
    {
      ds = dlsym (dh, cuda_symnames[i].n);
      if (!ds)
	{
	  dlclose (dh);
	  return -1;
	}
    }

  dlclose (dh);

  return 0;
}
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      cuStreamDestroy (s->stream);
      map_fini (s);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream to
   use as result.  */

static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it
     doesn't (because the null stream enforces overly-strict synchronisation
     with respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct ptx_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct ptx_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed
	 to only wait for asynchronous launches from the same host thread they
	 are invoked on.  If multiple threads use the same async value, we make
	 note of that here and fall back to testing/waiting for all threads in
	 those functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
static int nvptx_get_num_devices (void);

/* Initialize the device.  */

static int
nvptx_init (void)
{
  CUresult r;
  int rc;

  if (ptx_inited)
    return nvptx_get_num_devices ();

  rc = verify_device_library ();
  if (rc < 0)
    return -1;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  ptx_inited = true;

  return nvptx_get_num_devices ();
}
static void
nvptx_fini (void)
{
  ptx_inited = false;
}
static void *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (ptx_dev, async_engines);

  return (void *) ptx_dev;
}
static int
nvptx_close_device (void *targ_data)
{
  CUresult r;
  struct ptx_device *ptx_dev = targ_data;

  if (!ptx_dev)
    return 0;

  fini_streams_for_device (ptx_dev);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);

  return 0;
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (!ptx_inited)
    cuInit (0);

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
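/* JIT-link the PTX fragments for the abort and acc_on_device helpers, the
   GOACC internal routines, and the caller's PTX_CODE into a single image,
   then load it as a CUDA module in *MODULE.  The CU_JIT_TARGET used below
   corresponds to compute capability 3.0.  */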
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
			 cuda_error (r));
    }

  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
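/* Sweep the global list of outstanding events, destroying and unlinking any
   whose CUDA event has completed and performing the per-event-type cleanup
   (popping kernel argument mappings, unmapping asynchronously-copied
   variables).  MEMMAP_LOCKABLE is false when a caller already holds the
   memory-map lock, in which case PTX_EVT_ASYNC_CLEANUP events are deferred
   to a later sweep.  */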
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
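/* Record a new outstanding event of TYPE on the global list.  E is the CUDA
   event to be queried later; H is the type-specific payload (the stream for
   kernel launches, the host address for memory copies, the target memory
   descriptor for async cleanups).  */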
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
	  || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
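/* Launch the kernel described by FN (really a targ_fn_descriptor) with
   MAPNUM pointer arguments taken from DEVADDRS, on the stream selected by
   ASYNC.  The argument vector is staged through the stream's
   host/device-mapped page (see map_push) so that a single pointer can be
   handed to cuLaunchKernel.  */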
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    size_t *sizes, unsigned short *kinds, int num_gangs,
	    int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP
     is the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
		     targ_fn->name);

  //  OpenACC	       CUDA
  //
  //  num_gangs	       blocks
  //  num_workers      warps (where a warp is equivalent to 32 threads)
  //  vector length    threads

  /* The OpenACC vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However,
     the CUDA interface abstracts away from that, and only shows us warp size
     indirectly in the maximum number of threads per block, which is a product
     of warp size and the number of hyperthreads of a multiprocessor.

     We choose to map OpenACC vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in GCC code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an OpenACC vector_length of more than the maximum number
     of threads per block will result in a CUDA error.  */
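  /* A worked example (illustrative, not from the original source): with
     vector_length = 128 the launch below uses a 128x1x1 thread block,
     i.e. four 32-thread warps per gang, and the gangs form the
     num_gangs x1x1 grid of blocks.  */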
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      num_gangs, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void *openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *)d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr)p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr)p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
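/* Copy S bytes from host address H to device address D, after validating
   both addresses.  On a stream other than the null stream the copy is
   issued asynchronously and tracked with a PTX_EVT_MEM event; otherwise it
   is a blocking cuMemcpyHtoD.  */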
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
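/* Copy S bytes from device address D back to host address H; the mirror
   image of nvptx_host2dev above.  */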
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
	 whether all work has completed on this stream, and if so omits the
	 call to the wait hook.  If that happens, event_gc might not get called
	 (which prevents variables from getting unmapped and their associated
	 device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
	  && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
	{
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
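/* Make the stream for ASYNC2 wait for completion of all work currently
   queued on the stream for ASYNC1, without blocking the host: record an
   event on ASYNC1's stream and insert a cuStreamWaitEvent on ASYNC2's.  */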
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
	{
	  r = cuStreamQuery (s->stream);
	  if (r == CUDA_SUCCESS)
	    continue;
	  else if (r != CUDA_ERROR_NOT_READY)
	    GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	  r = cuStreamSynchronize (s->stream);
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
	  && !pthread_equal (other_stream->host_thread, self))
	continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in
     stream, and add it to our list, removing the previous entry also (if
     there was one) in order to prevent resource leaks.  Note the potential
     for surprise here: maybe we should keep track of passed-in streams and
     leave it up to the user to tidy those up, but that doesn't work for
     stream handles returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
	nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
	{
	  struct ptx_stream *s = nvthd->ptx_dev->active_streams;
	  while (s->next != oldstream)
	    s = s->next;
	  s->next = s->next->next;
	}

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
static void **kernel_target_data;
static void **kernel_host_table;

void
GOMP_OFFLOAD_register_image (void *host_table, void *target_data)
{
  kernel_target_data = target_data;
  kernel_host_table = host_table;
}

void
GOMP_OFFLOAD_init_device (int n __attribute__ ((unused)))
{
  (void) nvptx_init ();
}

void
GOMP_OFFLOAD_fini_device (int n __attribute__ ((unused)))
{
  nvptx_fini ();
}
int
GOMP_OFFLOAD_get_table (int n __attribute__ ((unused)),
			struct mapping_table **tablep)
{
  CUmodule module;
  void **fn_table;
  char **fn_names;
  int fn_entries, i;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;

  if (nvptx_init () <= 0)
    return 0;

  /* This isn't an error, because an image may legitimately have no offloaded
     regions and so will not call GOMP_offload_register.  */
  if (kernel_target_data == NULL)
    return 0;

  link_ptx (&module, kernel_target_data[0]);

  /* kernel_target_data[0] -> ptx code
     kernel_target_data[1] -> variable mappings
     kernel_target_data[2] -> array of kernel names in ascii

     kernel_host_table[0] -> start of function addresses (__offload_func_table)
     kernel_host_table[1] -> end of function addresses (__offload_funcs_end)

     The array of kernel names and the function addresses form a
     one-to-one correspondence.  */

  fn_table = kernel_host_table[0];
  fn_names = (char **) kernel_target_data[2];
  fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);

  *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  for (i = 0; i < fn_entries; i++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns[i].fn = function;
      targ_fns[i].name = (const char *) fn_names[i];

      (*tablep)[i].host_start = (uintptr_t) fn_table[i];
      (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
      (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
      (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
    }

  return fn_entries;
}
void *
GOMP_OFFLOAD_alloc (int n __attribute__ ((unused)), size_t size)
{
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int n __attribute__ ((unused)), void *ptr)
{
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord __attribute__ ((unused)), void *dst,
		       const void *src, size_t n)
{
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord __attribute__ ((unused)), void *dst,
		       const void *src, size_t n)
{
  return nvptx_host2dev (dst, src, n);
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
			       void **hostaddrs, void **devaddrs, size_t *sizes,
			       unsigned short *kinds, int num_gangs,
			       int num_workers, int vector_length, int async,
			       void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	      num_workers, vector_length, async, targ_mem_desc);
}

void *
GOMP_OFFLOAD_openacc_open_device (int n)
{
  return nvptx_open_device (n);
}

int
GOMP_OFFLOAD_openacc_close_device (void *h)
{
  return nvptx_close_device (h);
}

void
GOMP_OFFLOAD_openacc_set_device_num (int n)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (n >= 0);

  if (!nvthd->ptx_dev || nvthd->ptx_dev->ord != n)
    (void) nvptx_open_device (n);
}
/* This can be called before the device is "opened" for the current thread, in
   which case we can't tell which device number should be returned.  We don't
   actually want to open the device here, so just return -1 and let the caller
   (oacc-init.c:acc_get_device_num) handle it.  */

int
GOMP_OFFLOAD_openacc_get_device_num (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (nvthd && nvthd->ptx_dev)
    return nvthd->ptx_dev->ord;
  else
    return -1;
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (void *targ_data)
{
  struct ptx_device *ptx_dev = (struct ptx_device *) targ_data;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}