/* Plugin for NVPTX execution.

   Copyright (C) 2013-2015 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-ptx.h"
#include "oacc-plugin.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>

#define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
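
/* Table mapping CUresult codes to human-readable error messages; searched
   linearly by cuda_error below.  */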
static const struct
{
  CUresult r;
  const char *m;
} cuda_errlist[] =
{
  { CUDA_ERROR_INVALID_VALUE, "invalid value" },
  { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
  { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
  { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
  { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
  { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
  { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
  { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
  { CUDA_ERROR_NO_DEVICE, "no device" },
  { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
  { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
  { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
  { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
  { CUDA_ERROR_MAP_FAILED, "map error" },
  { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
  { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
  { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
  { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
  { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
  { CUDA_ERROR_NOT_MAPPED, "not mapped" },
  { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
  { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
  { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
  { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
  { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
  { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
  { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
  { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
  { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
    "shared object symbol not found" },
  { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
  { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
  { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
  { CUDA_ERROR_NOT_FOUND, "not found" },
  { CUDA_ERROR_NOT_READY, "not ready" },
  { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
  { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
  { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
  { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
    "launch incompatible texturing" },
  { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
  { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
  { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
  { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
  { CUDA_ERROR_ASSERT, "assert" },
  { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
  { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
    "host memory already registered" },
  { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
  { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
  { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
  { CUDA_ERROR_UNKNOWN, "unknown" }
};
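
/* Return the error message for CUDA result code R, falling back to a
   static buffer for codes not in the table above (note: that fallback
   is not thread-safe).  */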
static const char *
cuda_error (CUresult r)
{
  int i;

  for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
    if (cuda_errlist[i].r == r)
      return cuda_errlist[i].m;

  static char errmsg[30];

  snprintf (errmsg, sizeof (errmsg), "unknown error code: %d", r);

  return errmsg;
}
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
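
/* A CUDA stream together with one page of page-locked host memory, mapped
   at D on the device and H on the host, which is used as a ring buffer of
   argument-mapping chunks for kernel launches on this stream; see map_init,
   map_push and map_pop below.  */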
struct ptx_stream
{
  CUstream stream;
  pthread_t host_thread;
  bool multithreaded;

  CUdeviceptr d;
  void *h;
  void *h_begin;
  void *h_end;
  void *h_next;
  void *h_prev;
  void *h_tail;

  struct ptx_stream *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  struct ptx_stream *current_stream;
  struct ptx_device *ptx_dev;
};
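
/* Header of one chunk in a stream's ring buffer: the async value the chunk
   was queued under, its total size (header included), and the mapping data
   itself as a trailing zero-length array.  */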
struct map
{
  int async;
  size_t size;
  char mappings[0];
};
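
/* Allocate one page of page-locked host memory for stream S, obtain the
   corresponding device pointer, and initialize the ring-buffer cursors.  */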
static void
map_init (struct ptx_stream *s)
{
  CUresult r;

  int size = getpagesize ();

  assert (s);
  assert (!s->d);
  assert (!s->h);

  r = cuMemAllocHost (&s->h, size);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));

  r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));

  assert (s->h);

  s->h_begin = s->h;
  s->h_end = s->h_begin + size;
  s->h_next = s->h_prev = s->h_tail = s->h_begin;

  assert (s->h_next);
  assert (s->h_end);
}
static void
map_fini (struct ptx_stream *s)
{
  CUresult r;

  r = cuMemFreeHost (s->h);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
}
static void
map_pop (struct ptx_stream *s)
{
  struct map *m;

  assert (s != NULL);
  assert (s->h_next);
  assert (s->h_prev);
  assert (s->h_tail);

  m = s->h_tail;

  s->h_tail += m->size;

  if (s->h_tail >= s->h_end)
    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);

  if (s->h_next == s->h_tail)
    s->h_prev = s->h_next;

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);

  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);
}
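
/* Reserve a chunk of SIZE bytes (plus header) in stream S's ring buffer for
   async queue ASYNC, wrapping around to the start of the page if the space
   remaining at the end is too small.  Returns the host and device pointers
   of the reserved chunk's payload in *H and *D.  */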
static void
map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
{
  int left;
  int offset;
  struct map *m;

  assert (s != NULL);

  left = s->h_end - s->h_next;
  size += sizeof (struct map);

  assert (s->h_prev);
  assert (s->h_next);

  if (size >= left)
    {
      m = s->h_prev;
      m->size += left;
      s->h_next = s->h_begin;

      if (s->h_next + size > s->h_end)
        GOMP_PLUGIN_fatal ("unable to push map");
    }

  assert (s->h_next);

  m = s->h_next;
  m->async = async;
  m->size = size;

  offset = (void *) &m->mappings[0] - s->h;

  *d = (void *) (s->d + offset);
  *h = (void *) (s->h + offset);

  s->h_prev = s->h_next;
  s->h_next += size;

  assert (s->h_prev);
  assert (s->h_next);

  assert (s->h_next >= s->h_begin);
  assert (s->h_tail >= s->h_begin);
  assert (s->h_prev >= s->h_begin);
  assert (s->h_next <= s->h_end);
  assert (s->h_tail <= s->h_end);
  assert (s->h_prev <= s->h_end);

  return;
}
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const char *name;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;
  struct ptx_stream *null_stream;
  /* All non-null streams associated with this device (actually context),
     either created implicitly or passed in from the user (via
     acc_set_cuda_stream).  */
  struct ptx_stream *active_streams;
  struct {
    struct ptx_stream **arr;
    int size;
  } async_streams;
  /* A lock for use when manipulating the above stream list and array.  */
  pthread_mutex_t stream_lock;
  int ord;
  bool overlap;
  bool map;
  bool concur;
  int mode;
  bool mkern;

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_device *next;
};
enum ptx_event_type
{
  PTX_EVT_MEM,
  PTX_EVT_KNL,
  PTX_EVT_SYNC,
  PTX_EVT_ASYNC_CLEANUP
};

struct ptx_event
{
  CUevent *evt;
  int type;
  void *addr;
  int ord;

  struct ptx_event *next;
};

static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;

static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
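
/* Set up the initial set of streams for PTX_DEV: the null stream plus an
   (initially empty) array with room for CONCURRENCY async streams.  */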
static void
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
  int i;
  struct ptx_stream *null_stream
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

  null_stream->stream = NULL;
  null_stream->host_thread = pthread_self ();
  null_stream->multithreaded = true;
  null_stream->d = (CUdeviceptr) NULL;
  null_stream->h = NULL;
  map_init (null_stream);
  ptx_dev->null_stream = null_stream;

  ptx_dev->active_streams = NULL;
  pthread_mutex_init (&ptx_dev->stream_lock, NULL);

  if (concurrency < 1)
    concurrency = 1;

  /* This is just a guess -- make space for as many async streams as the
     current device is capable of concurrently executing.  This can grow
     later as necessary.  No streams are created yet.  */
  ptx_dev->async_streams.arr
    = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
  ptx_dev->async_streams.size = concurrency;

  for (i = 0; i < concurrency; i++)
    ptx_dev->async_streams.arr[i] = NULL;
}
static void
fini_streams_for_device (struct ptx_device *ptx_dev)
{
  free (ptx_dev->async_streams.arr);

  while (ptx_dev->active_streams != NULL)
    {
      struct ptx_stream *s = ptx_dev->active_streams;
      ptx_dev->active_streams = ptx_dev->active_streams->next;

      map_fini (s);
      cuStreamDestroy (s->stream);
      free (s);
    }

  map_fini (ptx_dev->null_stream);
  free (ptx_dev->null_stream);
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
   thread THREAD (and also current device/context).  If CREATE is true, create
   the stream if it does not exist (or use EXISTING if it is non-NULL), and
   associate the stream with the same thread argument.  Returns the stream
   to use.  */
static struct ptx_stream *
select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  /* Local copy of TLS variable.  */
  struct ptx_device *ptx_dev = nvthd->ptx_dev;
  struct ptx_stream *stream = NULL;
  int orig_async = async;

  /* The special value acc_async_noval (-1) maps (for now) to an
     implicitly-created stream, which is then handled the same as any other
     numbered async stream.  Other options are available, e.g. using the null
     stream for anonymous async operations, or choosing an idle stream from an
     active set.  But, stick with this for now.  */
  if (async > acc_async_sync)
    async++;

  if (create)
    pthread_mutex_lock (&ptx_dev->stream_lock);

  /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
     null stream, and in fact better performance may be obtainable if it doesn't
     (because the null stream enforces overly-strict synchronisation with
     respect to other streams for legacy reasons, and that's probably not
     needed with OpenACC).  Maybe investigate later.  */
  if (async == acc_async_sync)
    stream = ptx_dev->null_stream;
  else if (async >= 0 && async < ptx_dev->async_streams.size
           && ptx_dev->async_streams.arr[async] && !(create && existing))
    stream = ptx_dev->async_streams.arr[async];
  else if (async >= 0 && create)
    {
      if (async >= ptx_dev->async_streams.size)
        {
          int i, newsize = ptx_dev->async_streams.size * 2;

          if (async >= newsize)
            newsize = async + 1;

          ptx_dev->async_streams.arr
            = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
                                   newsize * sizeof (struct ptx_stream *));

          for (i = ptx_dev->async_streams.size; i < newsize; i++)
            ptx_dev->async_streams.arr[i] = NULL;

          ptx_dev->async_streams.size = newsize;
        }

      /* Create a new stream on-demand if there isn't one already, or if we're
         setting a particular async value to an existing (externally-provided)
         stream.  */
      if (!ptx_dev->async_streams.arr[async] || existing)
        {
          CUresult r;
          struct ptx_stream *s
            = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));

          if (existing)
            s->stream = existing;
          else
            {
              r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
              if (r != CUDA_SUCCESS)
                GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
            }

          /* If CREATE is true, we're going to be queueing some work on this
             stream.  Associate it with the current host thread.  */
          s->host_thread = thread;
          s->multithreaded = false;

          s->d = (CUdeviceptr) NULL;
          s->h = NULL;
          map_init (s);

          s->next = ptx_dev->active_streams;
          ptx_dev->active_streams = s;
          ptx_dev->async_streams.arr[async] = s;
        }

      stream = ptx_dev->async_streams.arr[async];
    }
  else if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  if (create)
    {
      assert (stream != NULL);

      /* If we're trying to use the same stream from different threads
         simultaneously, set stream->multithreaded to true.  This affects the
         behaviour of acc_async_test_all and acc_wait_all, which are supposed to
         only wait for asynchronous launches from the same host thread they are
         invoked on.  If multiple threads use the same async value, we make note
         of that here and fall back to testing/waiting for all threads in those
         functions.  */
      if (thread != stream->host_thread)
        stream->multithreaded = true;

      pthread_mutex_unlock (&ptx_dev->stream_lock);
    }
  else if (stream && !stream->multithreaded
           && !pthread_equal (stream->host_thread, thread))
    GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);

  return stream;
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  CUresult r;
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  r = cuInit (0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));

  ptx_events = NULL;

  pthread_mutex_init (&ptx_event_lock, NULL);

  r = cuDeviceGetCount (&ndevs);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
                                            * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
        {
          r = cuCtxPopCurrent (&old_ctx);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
        }

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;

      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

  r = cuDeviceGetAttribute (&async_engines,
                            CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}
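
/* Tear down PTX_DEV: destroy its streams and, unless the context was
   inherited from elsewhere, the CUDA context itself.  */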
static void
nvptx_close_device (struct ptx_device *ptx_dev)
{
  CUresult r;

  if (!ptx_dev)
    return;

  fini_streams_for_device (ptx_dev);

  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    {
      r = cuCtxDestroy (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
    }

  free (ptx_dev);
}
static int
nvptx_get_num_devices (void)
{
  int n;
  CUresult r;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    return 0;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      r = cuInit (0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
         no devices available.  */
      if (r != CUDA_SUCCESS)
        return 0;
    }

  r = cuDeviceGetCount (&n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));

  return n;
}
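
/* JIT-link PTX_CODE together with the plugin's built-in PTX helper routines
   (ABORT_PTX, ACC_ON_DEVICE_PTX, GOACC_INTERNAL_PTX) and load the result as
   MODULE.  */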
static void
link_ptx (CUmodule *module, const char *ptx_code)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  opts[6] = CU_JIT_TARGET;
  optvals[6] = (void *) CU_TARGET_COMPUTE_30;

  r = cuLinkCreate (7, opts, optvals, &linkstate);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));

  char *abort_ptx = ABORT_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
                     strlen (abort_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
    }

  char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
                     strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
                         cuda_error (r));
    }

  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
                     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
                         cuda_error (r));
    }

  /* cuLinkAddData's 'data' argument erroneously omits the const
     qualifier.  */
  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_code,
                     strlen (ptx_code) + 1, 0, 0, 0, 0);
  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
    }

  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  r = cuModuleLoadData (module, linkout);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
}
static void
event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      ptx_event = ptx_event->next;

      if (e->ord != nvthd->ptx_dev->ord)
        continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
        {
          CUevent *te;

          te = e->evt;

          switch (e->type)
            {
            case PTX_EVT_MEM:
            case PTX_EVT_SYNC:
              break;

            case PTX_EVT_KNL:
              map_pop (e->addr);
              break;

            case PTX_EVT_ASYNC_CLEANUP:
              {
                /* The function gomp_plugin_async_unmap_vars needs to claim the
                   memory-map splay tree lock for the current device, so we
                   can't call it when one of our callers has already claimed
                   the lock.  In that case, just delay the GC for this event
                   until later.  */
                if (!memmap_lockable)
                  continue;

                GOMP_PLUGIN_async_unmap_vars (e->addr);
              }
              break;
            }

          cuEventDestroy (*te);
          free ((void *) te);

          if (ptx_events == e)
            ptx_events = ptx_events->next;
          else
            {
              struct ptx_event *e_ = ptx_events;
              while (e_->next != e)
                e_ = e_->next;
              e_->next = e_->next->next;
            }

          free (e);
        }
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
static void
event_add (enum ptx_event_type type, CUevent *e, void *h)
{
  struct ptx_event *ptx_event;
  struct nvptx_thread *nvthd = nvptx_thread ();

  assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
          || type == PTX_EVT_ASYNC_CLEANUP);

  ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
  ptx_event->type = type;
  ptx_event->evt = e;
  ptx_event->addr = h;
  ptx_event->ord = nvthd->ptx_dev->ord;

  pthread_mutex_lock (&ptx_event_lock);

  ptx_event->next = ptx_events;
  ptx_events = ptx_event;

  pthread_mutex_unlock (&ptx_event_lock);
}
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            size_t *sizes, unsigned short *kinds, int num_gangs,
            int num_workers, int vector_length, int async, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  unsigned int nthreads_in_block;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch\n", __FUNCTION__,
                     targ_fn->name);

  // OpenACC        CUDA
  //
  // num_gangs      blocks
  // num_workers    warps (where a warp is equivalent to 32 threads)
  // vector length  threads

  /* The openacc vector_length clause 'determines the vector length to use for
     vector or SIMD operations'.  The question is how to map this to CUDA.

     In CUDA, the warp size is the vector length of a CUDA device.  However, the
     CUDA interface abstracts away from that, and only shows us warp size
     indirectly in maximum number of threads per block, which is a product of
     warp size and the number of hyperthreads of a multiprocessor.

     We choose to map openacc vector_length directly onto the number of threads
     in a block, in the x dimension.  This is reflected in gcc code generation
     that uses ThreadIdx.x to access vector elements.

     Attempting to use an openacc vector_length of more than the maximum number
     of threads per block will result in a cuda error.  */
  nthreads_in_block = vector_length;

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      num_gangs, 1, 1,
                      nthreads_in_block, 1, 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->name);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
void * openacc_get_current_cuda_context (void);
static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;
  CUresult r;

  r = cuMemAlloc (&d, s);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    return 0;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) d;
}
static void
nvptx_free (void *p)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if ((CUdeviceptr) p != pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemFree ((CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr) d, h, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyHtoD ((CUdeviceptr) d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr) d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *) (d + s) > (void *) (pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr) d, s,
                             nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *) h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr) d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
static void
nvptx_set_async (int async)
{
  struct nvptx_thread *nvthd = nvptx_thread ();
  nvthd->current_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);
}
static int
nvptx_async_test (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamQuery (s->stream);
  if (r == CUDA_SUCCESS)
    {
      /* The oacc-parallel.c:goacc_wait function calls this hook to determine
         whether all work has completed on this stream, and if so omits the call
         to the wait hook.  If that happens, event_gc might not get called
         (which prevents variables from getting unmapped and their associated
         device storage freed), so call it here.  */
      event_gc (true);
      return 1;
    }
  else if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

  return 0;
}
static int
nvptx_async_test_all (void)
{
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if ((s->multithreaded || pthread_equal (s->host_thread, self))
          && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
        {
          pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
          return 0;
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);

  return 1;
}
static void
nvptx_wait (int async)
{
  CUresult r;
  struct ptx_stream *s;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  if (!s)
    GOMP_PLUGIN_fatal ("unknown async %d", async);

  r = cuStreamSynchronize (s->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  event_gc (true);
}
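
/* Make the stream for ASYNC2 wait (on the device, without blocking the host)
   for all work currently queued on the stream for ASYNC1, by recording an
   event on the latter and inserting a wait for it on the former.  */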
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
static void
nvptx_wait_all (void)
{
  CUresult r;
  struct ptx_stream *s;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  /* Wait for active streams initiated by this thread (or by multiple threads)
     to complete.  */
  for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
    {
      if (s->multithreaded || pthread_equal (s->host_thread, self))
        {
          r = cuStreamQuery (s->stream);
          if (r == CUDA_SUCCESS)
            continue;
          else if (r != CUDA_ERROR_NOT_READY)
            GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

          r = cuStreamSynchronize (s->stream);
          if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
        }
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

static void *
nvptx_get_cuda_stream (int async)
{
  struct ptx_stream *s;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  s = select_stream_for_async (async, pthread_self (), false, NULL);

  return s ? s->stream : NULL;
}
static int
nvptx_set_cuda_stream (int async, void *stream)
{
  struct ptx_stream *oldstream;
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  if (async < 0)
    GOMP_PLUGIN_fatal ("bad async %d", async);

  /* We have a list of active streams and an array mapping async values to
     entries of that list.  We need to take "ownership" of the passed-in stream,
     and add it to our list, removing the previous entry also (if there was one)
     in order to prevent resource leaks.  Note the potential for surprise
     here: maybe we should keep track of passed-in streams and leave it up to
     the user to tidy those up, but that doesn't work for stream handles
     returned from acc_get_cuda_stream above...  */

  oldstream = select_stream_for_async (async, self, false, NULL);

  if (oldstream)
    {
      if (nvthd->ptx_dev->active_streams == oldstream)
        nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
      else
        {
          struct ptx_stream *s = nvthd->ptx_dev->active_streams;
          while (s->next != oldstream)
            s = s->next;
          s->next = s->next->next;
        }

      cuStreamDestroy (oldstream->stream);
      map_fini (oldstream);
      free (oldstream);
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  (void) select_stream_for_async (async, self, true, (CUstream) stream);

  return 1;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
void
GOMP_OFFLOAD_init_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return;
    }

  ptx_devices[n] = nvptx_open_device (n);
  instantiated_devices++;

  pthread_mutex_unlock (&ptx_dev_lock);
}

void
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      nvptx_attach_host_thread_to_device (n);
      nvptx_close_device (ptx_devices[n]);
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
}
/* Data emitted by mkoffload.  */

typedef struct nvptx_tdata
{
  const char *ptx_src;

  const char *const *var_names;
  size_t var_num;

  const char *const *fn_names;
  size_t fn_num;
} nvptx_tdata_t;
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, const void *target_data,
                         struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *fn_names, *const *var_names;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];

  nvptx_attach_host_thread_to_device (ord);

  link_ptx (&module, img_header->ptx_src);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_names = img_header->fn_names;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
                                 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
                                 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_names[i]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->name = (const char *) fn_names[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

void
GOMP_OFFLOAD_unload_image (int ord, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
        *prev_p = image->next;
        cuModuleUnload (image->module);
        free (image->fns);
        free (image);
        break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_alloc (size);
}

void
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  nvptx_attach_host_thread_to_device (ord);
  nvptx_free (ptr);
}

void *
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_dev2host (dst, src, n);
}

void *
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  nvptx_attach_host_thread_to_device (ord);
  return nvptx_host2dev (dst, src, n);
}
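
/* Presumably a placeholder for an OpenMP-style target-run hook; this
   OpenACC-only plugin never assigns or calls it.  */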
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
                               void **hostaddrs, void **devaddrs, size_t *sizes,
                               unsigned short *kinds, int num_gangs,
                               int num_workers, int vector_length, int async,
                               void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
              num_workers, vector_length, async, targ_mem_desc);
}
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  return nvptx_async_test (async);
}

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  return nvptx_async_test_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
}

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
}

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
}

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
}

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUresult r;
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  r = cuCtxGetCurrent (&thd_ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    {
      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }

  nvthd->current_stream = ptx_dev->null_stream;
  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* NOTE: This returns a CUstream, not a ptx_stream pointer.  */

void *
GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
{
  return nvptx_get_cuda_stream (async);
}

/* NOTE: This takes a CUstream, not a ptx_stream pointer.  */

int
GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
  return nvptx_set_cuda_stream (async, stream);
}