Build a shared host libiberty also for libcc1's benefit.
[official-gcc.git] / libgomp / plugin-nvptx.c
blob33f868ae6c6d38603de1ee7e2cbd563a92b7694d
1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2014 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU OpenMP Library (libgomp).
9 Libgomp is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
12 any later version.
14 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
16 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 more details.
19 Under Section 7 of GPL version 3, you are granted additional
20 permissions described in the GCC Runtime Library Exception, version
21 3.1, as published by the Free Software Foundation.
23 You should have received a copy of the GNU General Public License and
24 a copy of the GCC Runtime Library Exception along with this program;
25 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
26 <http://www.gnu.org/licenses/>. */
28 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
29 library appears to hold some implicit state, but the documentation
30 is not clear as to what that state might be. Or how one might
31 propagate it from one thread to another. */
33 //#define DEBUG
34 //#define DISABLE_ASYNC
36 #include "openacc.h"
37 #include "config.h"
38 #include "libgomp.h"
39 #include "target.h"
40 #include "libgomp-plugin.h"
42 #include <cuda.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <dlfcn.h>
47 #include <unistd.h>
48 #include <assert.h>
50 #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
52 static struct _errlist
54 CUresult r;
55 char *m;
56 } cuErrorList[] = {
57 { CUDA_ERROR_INVALID_VALUE, "invalid value" },
58 { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
59 { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
60 { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
61 { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
62 { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
63 { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
64 { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
65 { CUDA_ERROR_NO_DEVICE, "no device" },
66 { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
67 { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
68 { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
69 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
70 { CUDA_ERROR_MAP_FAILED, "map error" },
71 { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
72 { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
73 { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
74 { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
75 { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
76 { CUDA_ERROR_NOT_MAPPED, "not mapped" },
77 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
78 { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
79 { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
80 { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
81 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
82 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
83 { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
84 { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
85 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
86 "shared object symbol not found" },
87 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
88 { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
89 { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
90 { CUDA_ERROR_NOT_FOUND, "not found" },
91 { CUDA_ERROR_NOT_READY, "not ready" },
92 { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
93 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
94 { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
95 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
96 "launch incompatibe texturing" },
97 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
98 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled " },
99 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary cotext active" },
100 { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
101 { CUDA_ERROR_ASSERT, "assert" },
102 { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
103 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
104 "host memory already registered" },
105 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
106 { CUDA_ERROR_NOT_PERMITTED, "no permitted" },
107 { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
108 { CUDA_ERROR_UNKNOWN, "unknown" }
111 static char errmsg[128];
113 static char *
114 cuErrorMsg (CUresult r)
116 int i;
118 for (i = 0; i < ARRAYSIZE (cuErrorList); i++)
120 if (cuErrorList[i].r == r)
121 return &cuErrorList[i].m[0];
124 sprintf (&errmsg[0], "unknown result code: %5d", r);
126 return &errmsg[0];
/* Descriptor for one device kernel: the resolved CUDA function handle
   plus the name it was looked up by (kept for diagnostics; see the
   GOMP_PLUGIN_notify calls in PTX_exec).  */
struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};
135 static bool PTX_inited = false;
/* A CUDA stream together with a page of memory mapped on both host and
   device, managed as a ring buffer by map_init/map_push/map_pop and used
   to stage kernel argument blocks.  */
struct PTX_stream
{
  CUstream stream;
  /* Host thread that last claimed this stream (see
     select_stream_for_async).  */
  pthread_t host_thread;
  /* Set when more than one host thread has used the stream; loosens the
     wrong-thread check in select_stream_for_async.  */
  bool multithreaded;

  /* Device-side and host-side addresses of the mapped page allocated in
     map_init (cuMemAllocHost + cuMemHostGetDevicePointer).  */
  CUdeviceptr d;
  void *h;
  /* Ring-buffer bounds and cursors within the mapped page: h_next is
     where the next chunk is carved, h_prev the most recent chunk, h_tail
     the oldest still-live chunk.  */
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  /* Link in the per-device active_streams list.  */
  struct PTX_stream *next;
};
154 /* Each thread may select a stream (also specific to a device/context). */
155 static __thread struct PTX_stream *current_stream;
/* Header for one chunk carved out of a stream's mapped page by map_push.
   SIZE includes this header; MAPPINGS is a GNU zero-length trailing
   array marking where the caller's data begins.  */
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
/* Allocate one page of host memory that is also mapped into the device
   address space, and initialize stream S's ring-buffer cursors to cover
   it.  Must be called exactly once per stream (asserted via !s->d /
   !s->h).  */
static void
map_init (struct PTX_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  /* Page-locked host allocation, visible to the device.  */
  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuErrorMsg (r));

  /* Device-side alias of the same page.  */
  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuErrorMsg (r));

  assert (s->h);

  /* Empty ring: all cursors at the start of the page.  */
  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
193 static void
194 map_fini (struct PTX_stream *s)
196 CUresult r;
198 r = cuMemFreeHost (s->h);
199 if (r != CUDA_SUCCESS)
200 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuErrorMsg (r));
/* Retire the oldest chunk in stream S's ring buffer, advancing the tail
   cursor past it (with wrap-around back to the start of the page).  */
static void
map_pop (struct PTX_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  /* Advance past the oldest chunk (m->size includes the header).  */
  s->h_tail += m->size;

  /* Wrap the tail back to the beginning of the page if it ran off the
     end.  */
  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  /* Ring now empty: collapse prev onto next.  */
  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
/* Carve a new chunk of SIZE bytes (plus header) out of stream S's ring
   buffer.  On return *H and *D point at the chunk's payload via the
   host-side and device-side addresses of the mapped page respectively.
   ASYNC is recorded in the chunk header.  If the chunk does not fit in
   the space remaining before h_end, the leftover space is folded into
   the previous chunk and allocation restarts from the beginning of the
   page; a chunk bigger than the whole page is fatal.  */
static void
map_push (struct PTX_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      /* Not enough room at the end of the page: absorb the slack into
	 the previous chunk so map_pop still walks chunk-to-chunk, then
	 wrap to the start.  */
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
	GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  /* Payload offset within the page; identical on the host and device
     views of the mapping.  */
  offset = (void *)&m->mappings[0] - s->h;

  *d = (void *)(s->d + offset);
  *h = (void *)(s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Per-device (really per-context) state.  */
struct PTX_device
{
  CUcontext ctx;
  /* True if CTX was already current when the device was opened (and so
     is not owned, and not destroyed, by this plugin).  */
  bool ctx_shared;
  CUdevice dev;
  /* Stream used for synchronous (acc_async_sync) operations.  */
  struct PTX_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct PTX_stream *active_streams;
  /* Async-value-indexed lookup array for the above; grown on demand in
     select_stream_for_async.  */
  struct {
    struct PTX_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  gomp_mutex_t stream_lock;
  /* Device ordinal as passed to PTX_open_device.  */
  int ord;
  /* Device attributes queried at open time (GPU_OVERLAP,
     CAN_MAP_HOST_MEMORY, CONCURRENT_KERNELS, COMPUTE_MODE, INTEGRATED
     respectively -- see PTX_open_device).  */
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  /* Link in the global PTX_devices list.  */
  struct PTX_device *next;
};
310 static __thread struct PTX_device *PTX_dev;
311 static struct PTX_device *PTX_devices;
/* Kinds of asynchronous completion event tracked by event_add/event_gc:
   memory transfers, kernel launches, synchronization markers, and
   deferred unmap-after-async cleanups.  */
enum PTX_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};
/* Bookkeeping record for one pending CUDA event, kept on the global
   PTX_events list until event_gc observes completion.  */
struct PTX_event
{
  /* Heap-allocated CUDA event handle; freed by event_gc.  */
  CUevent *evt;
  /* An enum PTX_event_type value.  */
  int type;
  /* Type-dependent payload: the stream for PTX_EVT_KNL, a host address
     for PTX_EVT_MEM, target memory descriptor for
     PTX_EVT_ASYNC_CLEANUP.  */
  void *addr;
  /* Ordinal of the device the event belongs to.  */
  int ord;

  struct PTX_event *next;
};
331 static gomp_mutex_t PTX_event_lock;
332 static struct PTX_event *PTX_events;
334 #define _XSTR(s) _STR(s)
335 #define _STR(s) #s
/* Names of every CUDA driver API entry point this plugin uses.
   verify_device_library checks each one can be resolved from libcuda.so
   before the plugin reports itself available.  */
static struct _synames
{
  char *n;
} cuSymNames[] =
{
  { _XSTR(cuCtxCreate) },
  { _XSTR(cuCtxDestroy) },
  { _XSTR(cuCtxGetCurrent) },
  { _XSTR(cuCtxPushCurrent) },
  { _XSTR(cuCtxSynchronize) },
  { _XSTR(cuDeviceGet) },
  { _XSTR(cuDeviceGetAttribute) },
  { _XSTR(cuDeviceGetCount) },
  { _XSTR(cuEventCreate) },
  { _XSTR(cuEventDestroy) },
  { _XSTR(cuEventQuery) },
  { _XSTR(cuEventRecord) },
  { _XSTR(cuInit) },
  { _XSTR(cuLaunchKernel) },
  { _XSTR(cuLinkAddData) },
  { _XSTR(cuLinkComplete) },
  { _XSTR(cuLinkCreate) },
  { _XSTR(cuMemAlloc) },
  { _XSTR(cuMemAllocHost) },
  { _XSTR(cuMemcpy) },
  { _XSTR(cuMemcpyDtoH) },
  { _XSTR(cuMemcpyDtoHAsync) },
  { _XSTR(cuMemcpyHtoD) },
  { _XSTR(cuMemcpyHtoDAsync) },
  { _XSTR(cuMemFree) },
  { _XSTR(cuMemFreeHost) },
  { _XSTR(cuMemGetAddressRange) },
  { _XSTR(cuMemHostGetDevicePointer) },
  { _XSTR(cuMemHostRegister) },
  { _XSTR(cuMemHostUnregister) },
  { _XSTR(cuModuleGetFunction) },
  { _XSTR(cuModuleLoadData) },
  { _XSTR(cuStreamDestroy) },
  { _XSTR(cuStreamQuery) },
  { _XSTR(cuStreamSynchronize) },
  { _XSTR(cuStreamWaitEvent) }
};
380 static int
381 verify_device_library (void)
383 int i;
384 void *dh, *ds;
386 dh = dlopen ("libcuda.so", RTLD_LAZY);
387 if (!dh)
388 return -1;
390 for (i = 0; i < ARRAYSIZE (cuSymNames); i++)
392 ds = dlsym (dh, cuSymNames[i].n);
393 if (!ds)
394 return -1;
397 dlclose (dh);
399 return 0;
/* Set up PTX_DEV's stream bookkeeping: create the null (synchronous)
   stream, an empty active-streams list, and a NULL-filled async-stream
   lookup array sized by CONCURRENCY (clamped to at least 1).  */
static void
init_streams_for_device (struct PTX_device *ptx_dev, int concurrency)
{
  int i;
  struct PTX_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));

  /* The null stream is the CUDA default stream (handle NULL) and is
     shared by all host threads.  */
  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  GOMP_PLUGIN_mutex_init (&ptx_dev->stream_lock);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct PTX_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
/* Tear down everything init_streams_for_device and
   select_stream_for_async created for PTX_DEV: the lookup array, every
   active stream (destroying its CUDA stream and mapped page), and
   finally the null stream.  */
static void
fini_streams_for_device (struct PTX_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct PTX_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      /* NOTE(review): cuStreamDestroy's result is ignored here, unlike
	 other driver calls in this file.  */
      cuStreamDestroy (s->stream);
      map_fini (s);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
453 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
454 thread THREAD (and also current device/context). If CREATE is true, create
455 the stream if it does not exist (or use EXISTING if it is non-NULL), and
456 associate the stream with the same thread argument. Returns stream to use
457 as result. */
static struct PTX_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
			 CUstream existing)
{
  /* Local copy of TLS variable.  */
  struct PTX_device *ptx_dev = PTX_dev;
  struct PTX_stream *stream = NULL;
  /* Preserved for diagnostics; ASYNC itself is remapped below.  */
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  /* The stream list/array is only mutated under the lock; read-only
     callers (create == false) go lock-free.  */
  if (create)
    GOMP_PLUGIN_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
	   && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      /* Grow the lookup array (doubling, or directly to ASYNC + 1 if
	 that is larger) so index ASYNC is valid.  */
      if (async >= ptx_dev->async_streams.size)
	{
	  int i, newsize = ptx_dev->async_streams.size * 2;

	  if (async >= newsize)
	    newsize = async + 1;

	  ptx_dev->async_streams.arr
	    = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
				   newsize * sizeof (struct PTX_stream *));

	  for (i = ptx_dev->async_streams.size; i < newsize; i++)
	    ptx_dev->async_streams.arr[i] = NULL;

	  ptx_dev->async_streams.size = newsize;
	}

      /* Create a new stream on-demand if there isn't one already, or if we're
	 setting a particular async value to an existing (externally-provided)
	 stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
	{
	  CUresult r;
	  struct PTX_stream *s
	    = GOMP_PLUGIN_malloc (sizeof (struct PTX_stream));

	  if (existing)
	    s->stream = existing;
	  else
	    {
	      r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
	      if (r != CUDA_SUCCESS)
		GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuErrorMsg (r));
	    }

	  /* If CREATE is true, we're going to be queueing some work on this
	     stream.  Associate it with the current host thread.  */
	  s->host_thread = thread;
	  s->multithreaded = false;

	  s->d = (CUdeviceptr) NULL;
	  s->h = NULL;
	  map_init (s);

	  s->next = ptx_dev->active_streams;
	  ptx_dev->active_streams = s;
	  ptx_dev->async_streams.arr[async] = s;
	}

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
	 simultaneously, set stream->multithreaded to true.  This affects the
	 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
	 only wait for asynchronous launches from the same host thread they are
	 invoked on.  If multiple threads use the same async value, we make note
	 of that here and fall back to testing/waiting for all threads in those
	 functions.  */
      if (thread != stream->host_thread)
	stream->multithreaded = true;

      GOMP_PLUGIN_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
	   && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s using stream %p (CUDA stream %p) "
	   "for async %d\n", __FILE__, __FUNCTION__, stream,
	   stream ? stream->stream : NULL, orig_async);
#endif

  return stream;
}
574 static int PTX_get_num_devices (void);
576 /* Initialize the device. */
/* Initialize the device.  Idempotent: after the first successful call,
   simply returns the device count.  Returns -1 if libcuda.so or one of
   its entry points is missing.  */
static int
PTX_init (void)
{
  CUresult r;
  int rc;

  if (PTX_inited)
    return PTX_get_num_devices ();

  /* Make sure the driver library is actually usable before calling into
     it.  */
  rc = verify_device_library ();
  if (rc < 0)
    return -1;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuErrorMsg (r));

  PTX_devices = NULL;
  PTX_events = NULL;

  GOMP_PLUGIN_mutex_init (&PTX_event_lock);

  PTX_inited = true;

  return PTX_get_num_devices ();
}
604 static int
605 PTX_fini (void)
607 PTX_inited = false;
609 return 0;
/* Open device ordinal N: if it was opened before, re-select it (pushing
   its context current again); otherwise create/adopt a context, query a
   set of device attributes, and set up its streams.  Returns the
   struct PTX_device, which is also cached in the thread-local PTX_dev.  */
static void *
PTX_open_device (int n)
{
  CUdevice dev;
  CUresult r;
  int async_engines, pi;

  /* Fast path: device already known from a previous open.  */
  if (PTX_devices)
    {
      struct PTX_device *ptx_device;

      for (ptx_device = PTX_devices;
	   ptx_device != NULL;
	   ptx_device = ptx_device->next)
	{
	  if (ptx_device->ord == n)
	    {
	      PTX_dev = ptx_device;

	      if (PTX_dev->ctx)
		{
		  r = cuCtxPushCurrent (PTX_dev->ctx);
		  if (r != CUDA_SUCCESS)
		    GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s",
				       cuErrorMsg (r));
		}

	      return (void *)PTX_dev;
	    }
	}
    }

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuErrorMsg (r));

  PTX_dev = GOMP_PLUGIN_malloc (sizeof (struct PTX_device));
  PTX_dev->ord = n;
  PTX_dev->dev = dev;
  PTX_dev->ctx_shared = false;

  PTX_dev->next = PTX_devices;
  PTX_devices = PTX_dev;

  /* Reuse an already-current context if one exists (e.g. created by the
     application via the CUDA runtime); otherwise create our own.  */
  r = cuCtxGetCurrent (&PTX_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuErrorMsg (r));

  if (!PTX_dev->ctx)
    {
      r = cuCtxCreate (&PTX_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuErrorMsg (r));
    }
  else
    {
      PTX_dev->ctx_shared = true;
    }

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->mode = pi;

  /* NOTE(review): MKERN is filled from CU_DEVICE_ATTRIBUTE_INTEGRATED,
     which looks inconsistent with the field's name -- confirm the
     intended attribute.  */
  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuErrorMsg (r));

  PTX_dev->mkern = pi;

  /* If the async-engine count cannot be queried, assume one engine.  */
  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  init_streams_for_device (PTX_dev, async_engines);

  current_stream = PTX_dev->null_stream;

  return (void *)PTX_dev;
}
/* Close the current device (H is unused): destroy its streams, destroy
   the context if this plugin created it (not if it was adopted/shared),
   unlink the device from the global list and free it.  */
static int
PTX_close_device (void *h __attribute__((unused)))
{
  CUresult r;

  if (!PTX_dev)
    return 0;

  fini_streams_for_device (PTX_dev);

  /* Only destroy contexts we created ourselves.  */
  if (!PTX_dev->ctx_shared)
    {
      r = cuCtxDestroy (PTX_dev->ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuErrorMsg (r));
    }

  /* Unlink from the singly-linked PTX_devices list.  */
  if (PTX_devices == PTX_dev)
    PTX_devices = PTX_devices->next;
  else
    {
      struct PTX_device* d = PTX_devices;
      while (d->next != PTX_dev)
	d = d->next;
      d->next = d->next->next;
    }

  free (PTX_dev);

  PTX_dev = NULL;

  return 0;
}
746 static int
747 PTX_get_num_devices (void)
749 int n;
750 CUresult r;
752 assert (PTX_inited);
754 r = cuDeviceGetCount (&n);
755 if (r!= CUDA_SUCCESS)
756 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuErrorMsg (r));
758 return n;
761 static bool
762 PTX_avail(void)
764 bool avail = false;
766 if (PTX_init () > 0)
767 avail = true;
769 return avail;
772 #define ABORT_PTX \
773 ".version 3.1\n" \
774 ".target sm_30\n" \
775 ".address_size 64\n" \
776 ".visible .func abort;\n" \
777 ".visible .func abort\n" \
778 "{\n" \
779 "trap;\n" \
780 "ret;\n" \
781 "}\n" \
782 ".visible .func _gfortran_abort;\n" \
783 ".visible .func _gfortran_abort\n" \
784 "{\n" \
785 "trap;\n" \
786 "ret;\n" \
787 "}\n" \
789 /* Generated with:
791 $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
793 #define ACC_ON_DEVICE_PTX \
794 " .version 3.1\n" \
795 " .target sm_30\n" \
796 " .address_size 64\n" \
797 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
798 ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
799 "{\n" \
800 " .reg.u32 %ar1;\n" \
801 ".reg.u32 %retval;\n" \
802 " .reg.u64 %hr10;\n" \
803 " .reg.u32 %r24;\n" \
804 " .reg.u32 %r25;\n" \
805 " .reg.pred %r27;\n" \
806 " .reg.u32 %r30;\n" \
807 " ld.param.u32 %ar1, [%in_ar1];\n" \
808 " mov.u32 %r24, %ar1;\n" \
809 " setp.ne.u32 %r27,%r24,4;\n" \
810 " set.u32.eq.u32 %r30,%r24,5;\n" \
811 " neg.s32 %r25, %r30;\n" \
812 " @%r27 bra $L3;\n" \
813 " mov.u32 %r25, 1;\n" \
814 "$L3:\n" \
815 " mov.u32 %retval, %r25;\n" \
816 " st.param.u32 [%out_retval], %retval;\n" \
817 " ret;\n" \
818 " }\n" \
819 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
820 ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
821 "{\n" \
822 " .reg.u64 %ar1;\n" \
823 ".reg.u32 %retval;\n" \
824 " .reg.u64 %hr10;\n" \
825 " .reg.u64 %r25;\n" \
826 " .reg.u32 %r26;\n" \
827 " .reg.u32 %r27;\n" \
828 " ld.param.u64 %ar1, [%in_ar1];\n" \
829 " mov.u64 %r25, %ar1;\n" \
830 " ld.u32 %r26, [%r25];\n" \
831 " {\n" \
832 " .param.u32 %retval_in;\n" \
833 " {\n" \
834 " .param.u32 %out_arg0;\n" \
835 " st.param.u32 [%out_arg0], %r26;\n" \
836 " call (%retval_in), acc_on_device, (%out_arg0);\n" \
837 " }\n" \
838 " ld.param.u32 %r27, [%retval_in];\n" \
839 "}\n" \
840 " mov.u32 %retval, %r27;\n" \
841 " st.param.u32 [%out_retval], %retval;\n" \
842 " ret;\n" \
843 " }"
/* JIT-link PTX_CODE together with the built-in abort and acc_on_device
   PTX stubs, then load the linked image as *MODULE.  Fatal on any
   driver error; the JIT's error log is printed first when linking
   fails.  */
static void
link_ptx (CUmodule *module, char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__((unused));

  GOMP_PLUGIN_notify ("attempting to load:\n---\n%s\n---\n", ptx_code);

  /* JIT options: collect wall time, info and error logs, be verbose,
     and target compute capability 3.0.  The *_SIZE_BYTES options pass
     the size as the option value itself, per the driver API.  */
  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuErrorMsg (r));

  /* Built-in abort stub (and its Fortran alias).  */
  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
		     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuErrorMsg (r));
    }

  /* Built-in acc_on_device implementation.  */
  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
		     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
			 cuErrorMsg (r));
    }

  /* The caller-supplied program itself.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
		     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuErrorMsg (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuErrorMsg (r));

  GOMP_PLUGIN_notify ("Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_notify ("Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuErrorMsg (r));
}
926 static void
927 event_gc (bool memmap_lockable)
929 struct PTX_event *ptx_event = PTX_events;
931 GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
933 while (ptx_event != NULL)
935 CUresult r;
936 struct PTX_event *e = ptx_event;
938 ptx_event = ptx_event->next;
940 if (e->ord != PTX_dev->ord)
941 continue;
943 r = cuEventQuery (*e->evt);
944 if (r == CUDA_SUCCESS)
946 CUevent *te;
948 te = e->evt;
950 switch (e->type)
952 case PTX_EVT_MEM:
953 case PTX_EVT_SYNC:
954 break;
956 case PTX_EVT_KNL:
957 map_pop (e->addr);
958 break;
960 case PTX_EVT_ASYNC_CLEANUP:
962 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
963 memory-map splay tree lock for the current device, so we
964 can't call it when one of our callers has already claimed
965 the lock. In that case, just delay the GC for this event
966 until later. */
967 if (!memmap_lockable)
968 continue;
970 GOMP_PLUGIN_async_unmap_vars (e->addr);
972 break;
975 cuEventDestroy (*te);
976 free ((void *)te);
978 if (PTX_events == e)
979 PTX_events = PTX_events->next;
980 else
982 struct PTX_event *e_ = PTX_events;
983 while (e_->next != e)
984 e_ = e_->next;
985 e_->next = e_->next->next;
988 free (e);
992 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
995 static void
996 event_add (enum PTX_event_type type, CUevent *e, void *h)
998 struct PTX_event *ptx_event;
1000 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1001 || type == PTX_EVT_ASYNC_CLEANUP);
1003 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event));
1004 ptx_event->type = type;
1005 ptx_event->evt = e;
1006 ptx_event->addr = h;
1007 ptx_event->ord = PTX_dev->ord;
1009 GOMP_PLUGIN_mutex_lock (&PTX_event_lock);
1011 ptx_event->next = PTX_events;
1012 PTX_events = ptx_event;
1014 GOMP_PLUGIN_mutex_unlock (&PTX_event_lock);
/* Launch the kernel described by FN (really a struct targ_fn_descriptor)
   with MAPNUM device pointers DEVADDRS as its single argument block, on
   the stream selected by ASYNC.  HOSTADDRS, SIZES, KINDS, NUM_GANGS,
   NUM_WORKERS and TARG_MEM_DESC are currently unused here (mapping was
   done by the caller); VECTOR_LENGTH becomes the CUDA block size.
   Synchronous launches (async < acc_async_noval) wait for completion and
   pop the argument chunk immediately; asynchronous ones record a CUDA
   event so event_gc can reclaim the chunk later.  */
void
PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
	  int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct PTX_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_notify (" %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuErrorMsg (r));

  GOMP_PLUGIN_notify (" %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);

  // XXX: possible geometry mappings??
  //
  //      OpenACC        CUDA
  //
  //      num_gangs      blocks
  //      num_workers    warps (where a warp is equivalent to 32 threads)
  //      vector length  threads
  //

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      1, 1, 1,
		      nthreads_in_block, 1, 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuErrorMsg (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      /* Synchronous launch: wait for this stream to drain.  */
      r = cuStreamSynchronize (dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
    }
  else
    {
      /* Asynchronous launch: record an event after the kernel so that
	 event_gc can pop the argument chunk once it completes.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuErrorMsg (r));
#endif

  GOMP_PLUGIN_notify (" %s: kernel %s: finished\n", __FUNCTION__,
		      targ_fn->name);

  /* For synchronous launches the argument chunk can be retired now; the
     asynchronous path leaves it for event_gc.  */
#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
1128 void * openacc_get_current_cuda_context (void);
1130 static void *
1131 PTX_alloc (size_t s)
1133 CUdeviceptr d;
1134 CUresult r;
1136 r = cuMemAlloc (&d, s);
1137 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1138 return 0;
1139 if (r != CUDA_SUCCESS)
1140 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuErrorMsg (r));
1141 return (void *)d;
1144 static void
1145 PTX_free (void *p)
1147 CUresult r;
1148 CUdeviceptr pb;
1149 size_t ps;
1151 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1152 if (r != CUDA_SUCCESS)
1153 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
1155 if ((CUdeviceptr)p != pb)
1156 GOMP_PLUGIN_fatal ("invalid device address");
1158 r = cuMemFree ((CUdeviceptr)p);
1159 if (r != CUDA_SUCCESS)
1160 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuErrorMsg (r));
/* Copy S bytes from host address H to device address D, after validating
   that D lies within a live device allocation.  When the current stream
   is a non-null (async) stream the copy is enqueued asynchronously and a
   completion event is recorded; otherwise the copy is synchronous.
   Always returns 0.  */
static void *
PTX_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  /* Look up the allocation containing D so the copy can be
     bounds-checked below.  */
  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (current_stream != PTX_dev->null_stream)
    {
      /* Asynchronous path: enqueue the copy and record an event so
	 event_gc can observe completion.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));

      /* Reclaim any already-completed events first (without touching the
	 memory map -- our caller may hold its lock).  */
      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s, current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuErrorMsg (r));

      r = cuEventRecord (*e, current_stream->stream);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuErrorMsg (r));
    }

  return 0;
}
1226 static void *
1227 PTX_dev2host (void *h, const void *d, size_t s)
1229 CUresult r;
1230 CUdeviceptr pb;
1231 size_t ps;
1233 if (!s)
1234 return 0;
1236 if (!d)
1237 GOMP_PLUGIN_fatal ("invalid device address");
1239 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1240 if (r != CUDA_SUCCESS)
1241 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuErrorMsg (r));
1243 if (!pb)
1244 GOMP_PLUGIN_fatal ("invalid device address");
1246 if (!h)
1247 GOMP_PLUGIN_fatal ("invalid host address");
1249 if (d == h)
1250 GOMP_PLUGIN_fatal ("invalid host or device address");
1252 if ((void *)(d + s) > (void *)(pb + ps))
1253 GOMP_PLUGIN_fatal ("invalid size");
1255 #ifndef DISABLE_ASYNC
1256 if (current_stream != PTX_dev->null_stream)
1258 CUevent *e;
1260 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1262 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1263 if (r != CUDA_SUCCESS)
1264 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuErrorMsg (r));
1266 event_gc (false);
1268 r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, current_stream->stream);
1269 if (r != CUDA_SUCCESS)
1270 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuErrorMsg (r));
1272 r = cuEventRecord (*e, current_stream->stream);
1273 if (r != CUDA_SUCCESS)
1274 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1276 event_add (PTX_EVT_MEM, e, (void *)h);
1278 else
1279 #endif
1281 r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1282 if (r != CUDA_SUCCESS)
1283 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuErrorMsg (r));
1286 return 0;
1289 static void
1290 PTX_set_async (int async)
1292 current_stream = select_stream_for_async (async, pthread_self (), true, NULL);
1295 static int
1296 PTX_async_test (int async)
1298 CUresult r;
1299 struct PTX_stream *s;
1301 s = select_stream_for_async (async, pthread_self (), false, NULL);
1303 if (!s)
1304 GOMP_PLUGIN_fatal ("unknown async %d", async);
1306 r = cuStreamQuery (s->stream);
1307 if (r == CUDA_SUCCESS)
1309 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1310 whether all work has completed on this stream, and if so omits the call
1311 to the wait hook. If that happens, event_gc might not get called
1312 (which prevents variables from getting unmapped and their associated
1313 device storage freed), so call it here. */
1314 event_gc (true);
1315 return 1;
1317 else if (r == CUDA_ERROR_NOT_READY)
1318 return 0;
1320 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
1322 return 0;
1325 static int
1326 PTX_async_test_all (void)
1328 struct PTX_stream *s;
1329 pthread_t self = pthread_self ();
1331 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1333 for (s = PTX_dev->active_streams; s != NULL; s = s->next)
1335 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1336 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1338 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1339 return 0;
1343 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1345 event_gc (true);
1347 return 1;
1350 static void
1351 PTX_wait (int async)
1353 CUresult r;
1354 struct PTX_stream *s;
1356 s = select_stream_for_async (async, pthread_self (), false, NULL);
1358 if (!s)
1359 GOMP_PLUGIN_fatal ("unknown async %d", async);
1361 r = cuStreamSynchronize (s->stream);
1362 if (r != CUDA_SUCCESS)
1363 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
1365 event_gc (true);
1368 static void
1369 PTX_wait_async (int async1, int async2)
1371 CUresult r;
1372 CUevent *e;
1373 struct PTX_stream *s1, *s2;
1374 pthread_t self = pthread_self ();
1376 /* The stream that is waiting (rather than being waited for) doesn't
1377 necessarily have to exist already. */
1378 s2 = select_stream_for_async (async2, self, true, NULL);
1380 s1 = select_stream_for_async (async1, self, false, NULL);
1381 if (!s1)
1382 GOMP_PLUGIN_fatal ("invalid async 1\n");
1384 if (s1 == s2)
1385 GOMP_PLUGIN_fatal ("identical parameters");
1387 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1389 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1390 if (r != CUDA_SUCCESS)
1391 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1393 event_gc (true);
1395 r = cuEventRecord (*e, s1->stream);
1396 if (r != CUDA_SUCCESS)
1397 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1399 event_add (PTX_EVT_SYNC, e, NULL);
1401 r = cuStreamWaitEvent (s2->stream, *e, 0);
1402 if (r != CUDA_SUCCESS)
1403 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
1406 static void
1407 PTX_wait_all (void)
1409 CUresult r;
1410 struct PTX_stream *s;
1411 pthread_t self = pthread_self ();
1413 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1415 /* Wait for active streams initiated by this thread (or by multiple threads)
1416 to complete. */
1417 for (s = PTX_dev->active_streams; s != NULL; s = s->next)
1419 if (s->multithreaded || pthread_equal (s->host_thread, self))
1421 r = cuStreamQuery (s->stream);
1422 if (r == CUDA_SUCCESS)
1423 continue;
1424 else if (r != CUDA_ERROR_NOT_READY)
1425 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuErrorMsg (r));
1427 r = cuStreamSynchronize (s->stream);
1428 if (r != CUDA_SUCCESS)
1429 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuErrorMsg (r));
1433 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1435 event_gc (true);
1438 static void
1439 PTX_wait_all_async (int async)
1441 CUresult r;
1442 struct PTX_stream *waiting_stream, *other_stream;
1443 CUevent *e;
1444 pthread_t self = pthread_self ();
1446 /* The stream doing the waiting. This could be the first mention of the
1447 stream, so create it if necessary. */
1448 waiting_stream
1449 = select_stream_for_async (async, pthread_self (), true, NULL);
1451 /* Launches on the null stream already block on other streams in the
1452 context. */
1453 if (!waiting_stream || waiting_stream == PTX_dev->null_stream)
1454 return;
1456 event_gc (true);
1458 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1460 for (other_stream = PTX_dev->active_streams;
1461 other_stream != NULL;
1462 other_stream = other_stream->next)
1464 if (!other_stream->multithreaded
1465 && !pthread_equal (other_stream->host_thread, self))
1466 continue;
1468 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1470 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1471 if (r != CUDA_SUCCESS)
1472 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1474 /* Record an event on the waited-for stream. */
1475 r = cuEventRecord (*e, other_stream->stream);
1476 if (r != CUDA_SUCCESS)
1477 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1479 event_add (PTX_EVT_SYNC, e, NULL);
1481 r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1482 if (r != CUDA_SUCCESS)
1483 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuErrorMsg (r));
1486 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1489 static void *
1490 PTX_get_current_cuda_device (void)
1492 if (!PTX_dev)
1493 return NULL;
1495 return &PTX_dev->dev;
1498 static void *
1499 PTX_get_current_cuda_context (void)
1501 if (!PTX_dev)
1502 return NULL;
1504 return PTX_dev->ctx;
1507 static void *
1508 PTX_get_cuda_stream (int async)
1510 struct PTX_stream *s;
1512 if (!PTX_dev)
1513 return NULL;
1515 s = select_stream_for_async (async, pthread_self (), false, NULL);
1517 return s ? s->stream : NULL;
1520 static int
1521 PTX_set_cuda_stream (int async, void *stream)
1523 struct PTX_stream *oldstream;
1524 pthread_t self = pthread_self ();
1526 GOMP_PLUGIN_mutex_lock (&PTX_dev->stream_lock);
1528 if (async < 0)
1529 GOMP_PLUGIN_fatal ("bad async %d", async);
1531 /* We have a list of active streams and an array mapping async values to
1532 entries of that list. We need to take "ownership" of the passed-in stream,
1533 and add it to our list, removing the previous entry also (if there was one)
1534 in order to prevent resource leaks. Note the potential for surprise
1535 here: maybe we should keep track of passed-in streams and leave it up to
1536 the user to tidy those up, but that doesn't work for stream handles
1537 returned from acc_get_cuda_stream above... */
1539 oldstream = select_stream_for_async (async, self, false, NULL);
1541 if (oldstream)
1543 if (PTX_dev->active_streams == oldstream)
1544 PTX_dev->active_streams = PTX_dev->active_streams->next;
1545 else
1547 struct PTX_stream *s = PTX_dev->active_streams;
1548 while (s->next != oldstream)
1549 s = s->next;
1550 s->next = s->next->next;
1553 cuStreamDestroy (oldstream->stream);
1554 map_fini (oldstream);
1555 free (oldstream);
1558 GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock);
1560 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1562 return 1;
1565 /* Plugin entry points. */
1569 get_type (void)
1571 #ifdef DEBUG
1572 fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
1573 #endif
1575 return TARGET_TYPE_NVIDIA_PTX;
1578 unsigned int
1579 get_caps (void)
1581 return TARGET_CAP_OPENACC_200;
/* Plugin entry point: the short name identifying this plugin.  */

const char *
get_name (void)
{
  return "nvidia";
}
/* Plugin entry point: report how many PTX devices are available.  */

int
get_num_devices (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_num_devices ();
}
/* Recorded by offload_register and consumed by device_get_table:
   the target (PTX) image data and the host-side function table.  */
static void **kernel_target_data;
static void **kernel_host_table;
1603 void
1604 offload_register (void *host_table, void *target_data)
1606 #ifdef DEBUG
1607 fprintf (stderr, "libgomp plugin: %s:%s (%p, %p)\n", __FILE__, __FUNCTION__,
1608 host_table, target_data);
1609 #endif
1611 kernel_target_data = target_data;
1612 kernel_host_table = host_table;
/* Plugin entry point: initialize the PTX device layer.  */

int
device_init (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_init ();
}
/* Plugin entry point: tear down the PTX device layer.  */

int
device_fini (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_fini ();
}
1636 device_get_table (struct mapping_table **tablep)
1638 CUmodule module;
1639 void **fn_table;
1640 char **fn_names;
1641 int fn_entries, i;
1642 CUresult r;
1643 struct targ_fn_descriptor *targ_fns;
1645 #ifdef DEBUG
1646 fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
1647 tablep);
1648 #endif
1650 if (PTX_init () <= 0)
1651 return 0;
1653 /* This isn't an error, because an image may legitimately have no offloaded
1654 regions and so will not call GOMP_offload_register. */
1655 if (kernel_target_data == NULL)
1656 return 0;
1658 link_ptx (&module, kernel_target_data[0]);
1660 /* kernel_target_data[0] -> ptx code
1661 kernel_target_data[1] -> variable mappings
1662 kernel_target_data[2] -> array of kernel names in ascii
1664 kernel_host_table[0] -> start of function addresses (_omp_func_table)
1665 kernel_host_table[1] -> end of function addresses (_omp_funcs_end)
1667 The array of kernel names and the functions addresses form a
1668 one-to-one correspondence. */
1670 fn_table = kernel_host_table[0];
1671 fn_names = (char **) kernel_target_data[2];
1672 fn_entries = (kernel_host_table[1] - kernel_host_table[0]) / sizeof (void *);
1674 *tablep = GOMP_PLUGIN_malloc (sizeof (struct mapping_table) * fn_entries);
1675 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1676 * fn_entries);
1678 for (i = 0; i < fn_entries; i++)
1680 CUfunction function;
1682 r = cuModuleGetFunction (&function, module, fn_names[i]);
1683 if (r != CUDA_SUCCESS)
1684 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuErrorMsg (r));
1686 targ_fns[i].fn = function;
1687 targ_fns[i].name = (const char *) fn_names[i];
1689 (*tablep)[i].host_start = (uintptr_t) fn_table[i];
1690 (*tablep)[i].host_end = (*tablep)[i].host_start + 1;
1691 (*tablep)[i].tgt_start = (uintptr_t) &targ_fns[i];
1692 (*tablep)[i].tgt_end = (*tablep)[i].tgt_start + 1;
1695 return fn_entries;
/* Plugin entry point: allocate SIZE bytes of device memory.  */

void *
device_alloc (size_t size)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%zu)\n", __FILE__, __FUNCTION__,
	   size);
#endif

  return PTX_alloc (size);
}
/* Plugin entry point: free device memory PTR from device_alloc.  */

void
device_free (void *ptr)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, ptr);
#endif

  PTX_free (ptr);
}
/* Plugin entry point: copy N bytes from device SRC to host DST.  */

void *
device_dev2host (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_dev2host (dst, src, n);
}
/* Plugin entry point: copy N bytes from host SRC to device DST.  */

void *
device_host2dev (void *dst, const void *src, size_t n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %p, %zu)\n", __FILE__,
	   __FUNCTION__, dst, src, n);
#endif

  return PTX_host2dev (dst, src, n);
}
/* This plugin provides no target (OpenMP) run hook; only the OpenACC
   entry points below are implemented.  */
void (*device_run) (void *fn_ptr, void *vars) = NULL;
/* Plugin entry point: launch an OpenACC parallel region FN with the
   given mapping arrays and launch geometry, on stream ASYNC.

   Fix: the DEBUG trace forwarded DEVADDRS to PTX_exec but omitted it
   from the fprintf output, so the printed argument list did not match
   the call; include it.  */

void
openacc_parallel (void (*fn) (void *), size_t mapnum, void **hostaddrs,
		  void **devaddrs, size_t *sizes, unsigned short *kinds,
		  int num_gangs, int num_workers, int vector_length,
		  int async, void *targ_mem_desc)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p, %zu, %p, %p, %p, %p, %d, %d, "
	   "%d, %d, %p)\n", __FILE__, __FUNCTION__, fn, mapnum, hostaddrs,
	   devaddrs, sizes, kinds, num_gangs, num_workers, vector_length,
	   async, targ_mem_desc);
#endif

  PTX_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
	    num_workers, vector_length, async, targ_mem_desc);
}
/* Plugin entry point: open PTX device number N.  */

void *
openacc_open_device (int n)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__, n);
#endif

  return PTX_open_device (n);
}
/* Plugin entry point: close the device handle H.  */

int
openacc_close_device (void *h)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, h);
#endif

  return PTX_close_device (h);
}
1778 void
1779 openacc_set_device_num (int n)
1781 assert (n >= 0);
1783 if (!PTX_dev || PTX_dev->ord != n)
1784 (void) PTX_open_device (n);
1787 /* This can be called before the device is "opened" for the current thread, in
1788 which case we can't tell which device number should be returned. We don't
1789 actually want to open the device here, so just return -1 and let the caller
1790 (oacc-init.c:acc_get_device_num) handle it. */
1793 openacc_get_device_num (void)
1795 if (PTX_dev)
1796 return PTX_dev->ord;
1797 else
1798 return -1;
/* Plugin entry point: report whether a PTX device is available.  */

bool
openacc_avail (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_avail ();
}
1810 void
1811 openacc_register_async_cleanup (void *targ_mem_desc)
1813 CUevent *e;
1814 CUresult r;
1816 #ifdef DEBUG
1817 fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__,
1818 targ_mem_desc);
1819 #endif
1821 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1823 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1824 if (r != CUDA_SUCCESS)
1825 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r));
1827 r = cuEventRecord (*e, current_stream->stream);
1828 if (r != CUDA_SUCCESS)
1829 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r));
1831 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
/* Plugin entry point: nonzero if ASYNC's queued work has completed.  */

int
openacc_async_test (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_async_test (async);
}
/* Plugin entry point: nonzero if all async work has completed.  */

int
openacc_async_test_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_async_test_all ();
}
/* Plugin entry point: block until ASYNC's queued work completes.  */

void
openacc_async_wait (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait (async);
}
/* Plugin entry point: make ASYNC2 wait on ASYNC1's queued work.  */

void
openacc_async_wait_async (int async1, int async2)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %d)\n", __FILE__, __FUNCTION__,
	   async1, async2);
#endif

  PTX_wait_async (async1, async2);
}
/* Plugin entry point: block until all async work completes.  */

void
openacc_async_wait_all (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  PTX_wait_all ();
}
/* Plugin entry point: make ASYNC wait on all other async work.  */

void
openacc_async_wait_all_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_wait_all_async (async);
}
/* Plugin entry point: make ASYNC the current queue for this thread.  */

void
openacc_async_set_async (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  PTX_set_async (async);
}
/* Plugin entry point: expose the current CUDA device handle.  */

void *
openacc_get_current_cuda_device (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_device ();
}
/* Plugin entry point: expose the current CUDA context.  */

void *
openacc_get_current_cuda_context (void)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s\n", __FILE__, __FUNCTION__);
#endif

  return PTX_get_current_cuda_context ();
}
/* NOTE: This returns a CUstream, not a PTX_stream pointer.  */

void *
openacc_get_cuda_stream (int async)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d)\n", __FILE__, __FUNCTION__,
	   async);
#endif

  return PTX_get_cuda_stream (async);
}
/* NOTE: This takes a CUstream, not a PTX_stream pointer.  */

int
openacc_set_cuda_stream (int async, void *stream)
{
#ifdef DEBUG
  fprintf (stderr, "libgomp plugin: %s:%s (%d, %p)\n", __FILE__, __FUNCTION__,
	   async, stream);
#endif

  return PTX_set_cuda_stream (async, stream);
}