1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2016 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
is not clear as to what that state might be, or how one might
propagate it from one thread to another.  */
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
51 static const char *
52 cuda_error (CUresult r)
54 #if CUDA_VERSION < 7000
55 /* Specified in documentation and present in library from at least
56 5.5. Not declared in header file prior to 7.0. */
57 extern CUresult cuGetErrorString (CUresult, const char **);
58 #endif
59 const char *desc;
61 r = cuGetErrorString (r, &desc);
62 if (r != CUDA_SUCCESS)
63 desc = "unknown cuda error";
65 return desc;
68 /* Convenience macros for the frequently used CUDA library call and
69 error handling sequence. This does not capture all the cases we
70 use in this file, but is common enough. */
72 #define CUDA_CALL_ERET(ERET, FN, ...) \
73 do { \
74 unsigned __r = FN (__VA_ARGS__); \
75 if (__r != CUDA_SUCCESS) \
76 { \
77 GOMP_PLUGIN_error (#FN " error: %s", \
78 cuda_error (__r)); \
79 return ERET; \
80 } \
81 } while (0)
83 #define CUDA_CALL(FN, ...) \
84 CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
86 #define CUDA_CALL_ASSERT(FN, ...) \
87 do { \
88 unsigned __r = FN (__VA_ARGS__); \
89 if (__r != CUDA_SUCCESS) \
90 { \
91 GOMP_PLUGIN_fatal (#FN " error: %s", \
92 cuda_error (__r)); \
93 } \
94 } while (0)
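/* Illustrative usage sketch (not part of the plugin): how the three macros
   above are typically combined.  The helper below is hypothetical;
   CUDA_CALL_ERET supplies an arbitrary error return value, CUDA_CALL is
   shorthand for returning false, and CUDA_CALL_ASSERT aborts via
   GOMP_PLUGIN_fatal.  */
#if 0
static void *
example_alloc_sketch (size_t size)
{
  CUdeviceptr d;
  /* On failure, report the error and return NULL from this function.  */
  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, size);
  /* On failure, abort the whole process with GOMP_PLUGIN_fatal.  */
  CUDA_CALL_ASSERT (cuMemsetD8, d, 0, size);
  return (void *) d;
}
#endif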
96 static unsigned int instantiated_devices = 0;
97 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
99 struct ptx_stream
101 CUstream stream;
102 pthread_t host_thread;
103 bool multithreaded;
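  /* D and H are the device- and host-side views of a single page of
     page-locked memory, allocated in map_init with cuMemAllocHost and mapped
     to the device via cuMemHostGetDevicePointer.  H_BEGIN/H_END bound that
     page, and H_NEXT, H_PREV and H_TAIL implement a simple ring allocator
     from which map_push carves argument blocks and map_pop retires them.  */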
105 CUdeviceptr d;
106 void *h;
107 void *h_begin;
108 void *h_end;
109 void *h_next;
110 void *h_prev;
111 void *h_tail;
113 struct ptx_stream *next;
116 /* Thread-specific data for PTX. */
118 struct nvptx_thread
120 struct ptx_stream *current_stream;
121 struct ptx_device *ptx_dev;
124 struct map
126 int async;
127 size_t size;
128 char mappings[0];
131 static bool
132 map_init (struct ptx_stream *s)
134 int size = getpagesize ();
136 assert (s);
137 assert (!s->d);
138 assert (!s->h);
140 CUDA_CALL (cuMemAllocHost, &s->h, size);
141 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
143 assert (s->h);
145 s->h_begin = s->h;
146 s->h_end = s->h_begin + size;
147 s->h_next = s->h_prev = s->h_tail = s->h_begin;
149 assert (s->h_next);
150 assert (s->h_end);
151 return true;
154 static bool
155 map_fini (struct ptx_stream *s)
157 CUDA_CALL (cuMemFreeHost, s->h);
158 return true;
161 static void
162 map_pop (struct ptx_stream *s)
164 struct map *m;
166 assert (s != NULL);
167 assert (s->h_next);
168 assert (s->h_prev);
169 assert (s->h_tail);
171 m = s->h_tail;
173 s->h_tail += m->size;
175 if (s->h_tail >= s->h_end)
176 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
178 if (s->h_next == s->h_tail)
179 s->h_prev = s->h_next;
181 assert (s->h_next >= s->h_begin);
182 assert (s->h_tail >= s->h_begin);
183 assert (s->h_prev >= s->h_begin);
185 assert (s->h_next <= s->h_end);
186 assert (s->h_tail <= s->h_end);
187 assert (s->h_prev <= s->h_end);
190 static void
191 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
193 int left;
194 int offset;
195 struct map *m;
197 assert (s != NULL);
199 left = s->h_end - s->h_next;
200 size += sizeof (struct map);
202 assert (s->h_prev);
203 assert (s->h_next);
205 if (size >= left)
207 m = s->h_prev;
208 m->size += left;
209 s->h_next = s->h_begin;
211 if (s->h_next + size > s->h_end)
212 GOMP_PLUGIN_fatal ("unable to push map");
215 assert (s->h_next);
217 m = s->h_next;
218 m->async = async;
219 m->size = size;
221 offset = (void *)&m->mappings[0] - s->h;
223 *d = (void *)(s->d + offset);
224 *h = (void *)(s->h + offset);
226 s->h_prev = s->h_next;
227 s->h_next += size;
229 assert (s->h_prev);
230 assert (s->h_next);
232 assert (s->h_next >= s->h_begin);
233 assert (s->h_tail >= s->h_begin);
234 assert (s->h_prev >= s->h_begin);
235 assert (s->h_next <= s->h_end);
236 assert (s->h_tail <= s->h_end);
237 assert (s->h_prev <= s->h_end);
239 return;
242 /* Target data function launch information. */
244 struct targ_fn_launch
246 const char *fn;
247 unsigned short dim[GOMP_DIM_MAX];
250 /* Target PTX object information. */
252 struct targ_ptx_obj
254 const char *code;
255 size_t size;
258 /* Target data image information. */
260 typedef struct nvptx_tdata
262 const struct targ_ptx_obj *ptx_objs;
263 unsigned ptx_num;
265 const char *const *var_names;
266 unsigned var_num;
268 const struct targ_fn_launch *fn_descs;
269 unsigned fn_num;
270 } nvptx_tdata_t;
272 /* Descriptor of a loaded function. */
274 struct targ_fn_descriptor
276 CUfunction fn;
277 const struct targ_fn_launch *launch;
278 int regs_per_thread;
279 int max_threads_per_block;
282 /* A loaded PTX image. */
283 struct ptx_image_data
285 const void *target_data;
286 CUmodule module;
288 struct targ_fn_descriptor *fns; /* Array of functions. */
290 struct ptx_image_data *next;
293 struct ptx_device
295 CUcontext ctx;
296 bool ctx_shared;
297 CUdevice dev;
298 struct ptx_stream *null_stream;
299 /* All non-null streams associated with this device (actually context),
300 either created implicitly or passed in from the user (via
301 acc_set_cuda_stream). */
302 struct ptx_stream *active_streams;
303 struct {
304 struct ptx_stream **arr;
305 int size;
306 } async_streams;
307 /* A lock for use when manipulating the above stream list and array. */
308 pthread_mutex_t stream_lock;
309 int ord;
310 bool overlap;
311 bool map;
312 bool concur;
313 bool mkern;
314 int mode;
315 int clock_khz;
316 int num_sms;
317 int regs_per_block;
318 int regs_per_sm;
320 struct ptx_image_data *images; /* Images loaded on device. */
321 pthread_mutex_t image_lock; /* Lock for above list. */
323 struct ptx_device *next;
326 enum ptx_event_type
328 PTX_EVT_MEM,
329 PTX_EVT_KNL,
330 PTX_EVT_SYNC,
331 PTX_EVT_ASYNC_CLEANUP
334 struct ptx_event
336 CUevent *evt;
337 int type;
338 void *addr;
339 int ord;
340 int val;
342 struct ptx_event *next;
345 static pthread_mutex_t ptx_event_lock;
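/* Global list of outstanding events (kernel launches, memory copies, syncs
   and deferred async cleanups), protected by the lock above.  Completed
   entries are reclaimed lazily by event_gc.  */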
346 static struct ptx_event *ptx_events;
348 static struct ptx_device **ptx_devices;
350 static inline struct nvptx_thread *
351 nvptx_thread (void)
353 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
356 static bool
357 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
359 int i;
360 struct ptx_stream *null_stream
361 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
363 null_stream->stream = NULL;
364 null_stream->host_thread = pthread_self ();
365 null_stream->multithreaded = true;
366 null_stream->d = (CUdeviceptr) NULL;
367 null_stream->h = NULL;
368 if (!map_init (null_stream))
369 return false;
371 ptx_dev->null_stream = null_stream;
372 ptx_dev->active_streams = NULL;
373 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
375 if (concurrency < 1)
376 concurrency = 1;
378 /* This is just a guess -- make space for as many async streams as the
379 current device is capable of concurrently executing. This can grow
380 later as necessary. No streams are created yet. */
381 ptx_dev->async_streams.arr
382 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
383 ptx_dev->async_streams.size = concurrency;
385 for (i = 0; i < concurrency; i++)
386 ptx_dev->async_streams.arr[i] = NULL;
388 return true;
391 static bool
392 fini_streams_for_device (struct ptx_device *ptx_dev)
394 free (ptx_dev->async_streams.arr);
396 bool ret = true;
397 while (ptx_dev->active_streams != NULL)
399 struct ptx_stream *s = ptx_dev->active_streams;
400 ptx_dev->active_streams = ptx_dev->active_streams->next;
402 ret &= map_fini (s);
404 CUresult r = cuStreamDestroy (s->stream);
405 if (r != CUDA_SUCCESS)
407 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
408 ret = false;
410 free (s);
413 ret &= map_fini (ptx_dev->null_stream);
414 free (ptx_dev->null_stream);
415 return ret;
418 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
419 thread THREAD (and also current device/context). If CREATE is true, create
420 the stream if it does not exist (or use EXISTING if it is non-NULL), and
421 associate the stream with the same thread argument. Returns stream to use
422 as result. */
424 static struct ptx_stream *
425 select_stream_for_async (int async, pthread_t thread, bool create,
426 CUstream existing)
428 struct nvptx_thread *nvthd = nvptx_thread ();
429 /* Local copy of TLS variable. */
430 struct ptx_device *ptx_dev = nvthd->ptx_dev;
431 struct ptx_stream *stream = NULL;
432 int orig_async = async;
434 /* The special value acc_async_noval (-1) maps (for now) to an
435 implicitly-created stream, which is then handled the same as any other
436 numbered async stream. Other options are available, e.g. using the null
437 stream for anonymous async operations, or choosing an idle stream from an
438 active set. But, stick with this for now. */
439 if (async > acc_async_sync)
440 async++;
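  /* After this adjustment the stream array is indexed as:
     acc_async_noval (-1) -> slot 0, async 0 -> slot 1, async N -> slot N + 1.
     acc_async_sync itself is handled separately below and uses the null
     stream.  */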
442 if (create)
443 pthread_mutex_lock (&ptx_dev->stream_lock);
445 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
446 null stream, and in fact better performance may be obtainable if it doesn't
447 (because the null stream enforces overly-strict synchronisation with
448 respect to other streams for legacy reasons, and that's probably not
449 needed with OpenACC). Maybe investigate later. */
450 if (async == acc_async_sync)
451 stream = ptx_dev->null_stream;
452 else if (async >= 0 && async < ptx_dev->async_streams.size
453 && ptx_dev->async_streams.arr[async] && !(create && existing))
454 stream = ptx_dev->async_streams.arr[async];
455 else if (async >= 0 && create)
457 if (async >= ptx_dev->async_streams.size)
459 int i, newsize = ptx_dev->async_streams.size * 2;
461 if (async >= newsize)
462 newsize = async + 1;
464 ptx_dev->async_streams.arr
465 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
466 newsize * sizeof (struct ptx_stream *));
468 for (i = ptx_dev->async_streams.size; i < newsize; i++)
469 ptx_dev->async_streams.arr[i] = NULL;
471 ptx_dev->async_streams.size = newsize;
474 /* Create a new stream on-demand if there isn't one already, or if we're
475 setting a particular async value to an existing (externally-provided)
476 stream. */
477 if (!ptx_dev->async_streams.arr[async] || existing)
479 CUresult r;
480 struct ptx_stream *s
481 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
483 if (existing)
484 s->stream = existing;
485 else
487 r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
488 if (r != CUDA_SUCCESS)
490 pthread_mutex_unlock (&ptx_dev->stream_lock);
491 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
492 cuda_error (r));
496 /* If CREATE is true, we're going to be queueing some work on this
497 stream. Associate it with the current host thread. */
498 s->host_thread = thread;
499 s->multithreaded = false;
501 s->d = (CUdeviceptr) NULL;
502 s->h = NULL;
503 if (!map_init (s))
505 pthread_mutex_unlock (&ptx_dev->stream_lock);
506 GOMP_PLUGIN_fatal ("map_init fail");
509 s->next = ptx_dev->active_streams;
510 ptx_dev->active_streams = s;
511 ptx_dev->async_streams.arr[async] = s;
514 stream = ptx_dev->async_streams.arr[async];
516 else if (async < 0)
518 if (create)
519 pthread_mutex_unlock (&ptx_dev->stream_lock);
520 GOMP_PLUGIN_fatal ("bad async %d", async);
523 if (create)
525 assert (stream != NULL);
527 /* If we're trying to use the same stream from different threads
528 simultaneously, set stream->multithreaded to true. This affects the
529 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
530 only wait for asynchronous launches from the same host thread they are
531 invoked on. If multiple threads use the same async value, we make note
532 of that here and fall back to testing/waiting for all threads in those
533 functions. */
534 if (thread != stream->host_thread)
535 stream->multithreaded = true;
537 pthread_mutex_unlock (&ptx_dev->stream_lock);
539 else if (stream && !stream->multithreaded
540 && !pthread_equal (stream->host_thread, thread))
541 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
543 return stream;
546 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
547 should be locked on entry and remains locked on exit. */
549 static bool
550 nvptx_init (void)
552 int ndevs;
554 if (instantiated_devices != 0)
555 return true;
557 CUDA_CALL (cuInit, 0);
558 ptx_events = NULL;
559 pthread_mutex_init (&ptx_event_lock, NULL);
561 CUDA_CALL (cuDeviceGetCount, &ndevs);
562 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
563 * ndevs);
564 return true;
567 /* Select the N'th PTX device for the current host thread. The device must
568 have been previously opened before calling this function. */
570 static bool
571 nvptx_attach_host_thread_to_device (int n)
573 CUdevice dev;
574 CUresult r;
575 struct ptx_device *ptx_dev;
576 CUcontext thd_ctx;
578 r = cuCtxGetDevice (&dev);
579 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
581 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
582 return false;
585 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
586 return true;
587 else
589 CUcontext old_ctx;
591 ptx_dev = ptx_devices[n];
592 if (!ptx_dev)
594 GOMP_PLUGIN_error ("device %d not found", n);
595 return false;
598 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
/* We don't necessarily have a current context (e.g. if it has been
   destroyed).  Pop it if we do, though.  */
602 if (thd_ctx != NULL)
603 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
605 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
607 return true;
610 static struct ptx_device *
611 nvptx_open_device (int n)
613 struct ptx_device *ptx_dev;
614 CUdevice dev, ctx_dev;
615 CUresult r;
616 int async_engines, pi;
618 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
620 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
622 ptx_dev->ord = n;
623 ptx_dev->dev = dev;
624 ptx_dev->ctx_shared = false;
626 r = cuCtxGetDevice (&ctx_dev);
627 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
629 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
630 return NULL;
633 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
635 /* The current host thread has an active context for a different device.
636 Detach it. */
637 CUcontext old_ctx;
638 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
641 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
643 if (!ptx_dev->ctx)
644 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
645 else
646 ptx_dev->ctx_shared = true;
648 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
649 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
650 ptx_dev->overlap = pi;
652 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
653 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
654 ptx_dev->map = pi;
656 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
657 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
658 ptx_dev->concur = pi;
660 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
661 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
662 ptx_dev->mode = pi;
664 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
665 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
666 ptx_dev->mkern = pi;
668 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
669 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
670 ptx_dev->clock_khz = pi;
672 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
673 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
674 ptx_dev->num_sms = pi;
676 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
677 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
678 ptx_dev->regs_per_block = pi;
680 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
681 in CUDA 6.0 and newer. */
682 r = cuDeviceGetAttribute (&pi, 82, dev);
683 /* Fallback: use limit of registers per block, which is usually equal. */
684 if (r == CUDA_ERROR_INVALID_VALUE)
685 pi = ptx_dev->regs_per_block;
686 else if (r != CUDA_SUCCESS)
688 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
689 return NULL;
691 ptx_dev->regs_per_sm = pi;
693 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
694 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
695 if (pi != 32)
697 GOMP_PLUGIN_error ("Only warp size 32 is supported");
698 return NULL;
701 r = cuDeviceGetAttribute (&async_engines,
702 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
703 if (r != CUDA_SUCCESS)
704 async_engines = 1;
706 ptx_dev->images = NULL;
707 pthread_mutex_init (&ptx_dev->image_lock, NULL);
709 if (!init_streams_for_device (ptx_dev, async_engines))
710 return NULL;
712 return ptx_dev;
715 static bool
716 nvptx_close_device (struct ptx_device *ptx_dev)
718 if (!ptx_dev)
719 return true;
721 if (!fini_streams_for_device (ptx_dev))
722 return false;
724 pthread_mutex_destroy (&ptx_dev->image_lock);
726 if (!ptx_dev->ctx_shared)
727 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
729 free (ptx_dev);
730 return true;
733 static int
734 nvptx_get_num_devices (void)
736 int n;
738 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
739 configurations. */
740 if (sizeof (void *) != 8)
741 return 0;
743 /* This function will be called before the plugin has been initialized in
744 order to enumerate available devices, but CUDA API routines can't be used
745 until cuInit has been called. Just call it now (but don't yet do any
746 further initialization). */
747 if (instantiated_devices == 0)
749 CUresult r = cuInit (0);
750 /* This is not an error: e.g. we may have CUDA libraries installed but
751 no devices available. */
752 if (r != CUDA_SUCCESS)
753 return 0;
756 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
757 return n;
761 static bool
762 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
763 unsigned num_objs)
765 CUjit_option opts[6];
766 void *optvals[6];
767 float elapsed = 0.0;
768 char elog[1024];
769 char ilog[16384];
770 CUlinkState linkstate;
771 CUresult r;
772 void *linkout;
773 size_t linkoutsize __attribute__ ((unused));
775 opts[0] = CU_JIT_WALL_TIME;
776 optvals[0] = &elapsed;
778 opts[1] = CU_JIT_INFO_LOG_BUFFER;
779 optvals[1] = &ilog[0];
781 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
782 optvals[2] = (void *) sizeof ilog;
784 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
785 optvals[3] = &elog[0];
787 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
788 optvals[4] = (void *) sizeof elog;
790 opts[5] = CU_JIT_LOG_VERBOSE;
791 optvals[5] = (void *) 1;
793 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
795 for (; num_objs--; ptx_objs++)
797 /* cuLinkAddData's 'data' argument erroneously omits the const
798 qualifier. */
799 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
800 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
801 ptx_objs->size, 0, 0, 0, 0);
802 if (r != CUDA_SUCCESS)
804 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
805 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
806 cuda_error (r));
807 return false;
811 GOMP_PLUGIN_debug (0, "Linking\n");
812 r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
814 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
815 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
817 if (r != CUDA_SUCCESS)
819 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
820 return false;
823 CUDA_CALL (cuModuleLoadData, module, linkout);
824 CUDA_CALL (cuLinkDestroy, linkstate);
825 return true;
828 static void
829 event_gc (bool memmap_lockable)
831 struct ptx_event *ptx_event = ptx_events;
832 struct ptx_event *async_cleanups = NULL;
833 struct nvptx_thread *nvthd = nvptx_thread ();
835 pthread_mutex_lock (&ptx_event_lock);
837 while (ptx_event != NULL)
839 CUresult r;
840 struct ptx_event *e = ptx_event;
842 ptx_event = ptx_event->next;
844 if (e->ord != nvthd->ptx_dev->ord)
845 continue;
847 r = cuEventQuery (*e->evt);
848 if (r == CUDA_SUCCESS)
850 bool append_async = false;
851 CUevent *te;
853 te = e->evt;
855 switch (e->type)
857 case PTX_EVT_MEM:
858 case PTX_EVT_SYNC:
859 break;
861 case PTX_EVT_KNL:
862 map_pop (e->addr);
863 break;
865 case PTX_EVT_ASYNC_CLEANUP:
/* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
868 memory-map splay tree lock for the current device, so we
869 can't call it when one of our callers has already claimed
870 the lock. In that case, just delay the GC for this event
871 until later. */
872 if (!memmap_lockable)
873 continue;
875 append_async = true;
877 break;
880 cuEventDestroy (*te);
881 free ((void *)te);
883 /* Unlink 'e' from ptx_events list. */
884 if (ptx_events == e)
885 ptx_events = ptx_events->next;
886 else
888 struct ptx_event *e_ = ptx_events;
889 while (e_->next != e)
890 e_ = e_->next;
891 e_->next = e_->next->next;
894 if (append_async)
896 e->next = async_cleanups;
897 async_cleanups = e;
899 else
900 free (e);
904 pthread_mutex_unlock (&ptx_event_lock);
906 /* We have to do these here, after ptx_event_lock is released. */
907 while (async_cleanups)
909 struct ptx_event *e = async_cleanups;
910 async_cleanups = async_cleanups->next;
912 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
913 free (e);
917 static void
918 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
920 struct ptx_event *ptx_event;
921 struct nvptx_thread *nvthd = nvptx_thread ();
923 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
924 || type == PTX_EVT_ASYNC_CLEANUP);
926 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
927 ptx_event->type = type;
928 ptx_event->evt = e;
929 ptx_event->addr = h;
930 ptx_event->ord = nvthd->ptx_dev->ord;
931 ptx_event->val = val;
933 pthread_mutex_lock (&ptx_event_lock);
935 ptx_event->next = ptx_events;
936 ptx_events = ptx_event;
938 pthread_mutex_unlock (&ptx_event_lock);
941 void
942 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
943 int async, unsigned *dims, void *targ_mem_desc)
945 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
946 CUfunction function;
947 CUresult r;
948 int i;
949 struct ptx_stream *dev_str;
950 void *kargs[1];
951 void *hp, *dp;
952 struct nvptx_thread *nvthd = nvptx_thread ();
953 const char *maybe_abort_msg = "(perhaps abort was called)";
955 function = targ_fn->fn;
957 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
958 assert (dev_str == nvthd->current_stream);
960 /* Initialize the launch dimensions. Typically this is constant,
961 provided by the device compiler, but we must permit runtime
962 values. */
963 int seen_zero = 0;
964 for (i = 0; i != GOMP_DIM_MAX; i++)
966 if (targ_fn->launch->dim[i])
967 dims[i] = targ_fn->launch->dim[i];
968 if (!dims[i])
969 seen_zero = 1;
972 if (seen_zero)
974 /* See if the user provided GOMP_OPENACC_DIM environment
975 variable to specify runtime defaults. */
976 static int default_dims[GOMP_DIM_MAX];
978 pthread_mutex_lock (&ptx_dev_lock);
979 if (!default_dims[0])
981 /* We only read the environment variable once. You can't
982 change it in the middle of execution. The syntax is
983 the same as for the -fopenacc-dim compilation option. */
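	  /* The value is a colon-separated triple, e.g.
	     GOMP_OPENACC_DIM=5120:32:32 for gang, worker and vector
	     respectively; omitted positions fall back to the defaults
	     computed below.  */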
984 const char *env_var = getenv ("GOMP_OPENACC_DIM");
985 if (env_var)
987 const char *pos = env_var;
989 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
991 if (i && *pos++ != ':')
992 break;
993 if (*pos != ':')
995 const char *eptr;
997 errno = 0;
998 long val = strtol (pos, (char **)&eptr, 10);
999 if (errno || val < 0 || (unsigned)val != val)
1000 break;
1001 default_dims[i] = (int)val;
1002 pos = eptr;
1007 int warp_size, block_size, dev_size, cpu_size;
1008 CUdevice dev = nvptx_thread()->ptx_dev->dev;
1009 /* 32 is the default for known hardware. */
1010 int gang = 0, worker = 32, vector = 32;
1011 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1013 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1014 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1015 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1016 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1018 if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
1019 && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
1020 && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
1021 && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev) == CUDA_SUCCESS)
1023 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1024 " dev_size=%d, cpu_size=%d\n",
1025 warp_size, block_size, dev_size, cpu_size);
1026 gang = (cpu_size / block_size) * dev_size;
1027 worker = block_size / warp_size;
1028 vector = warp_size;
1031 /* There is no upper bound on the gang size. The best size
1032 matches the hardware configuration. Logical gangs are
1033 scheduled onto physical hardware. To maximize usage, we
1034 should guess a large number. */
1035 if (default_dims[GOMP_DIM_GANG] < 1)
1036 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1037 /* The worker size must not exceed the hardware. */
1038 if (default_dims[GOMP_DIM_WORKER] < 1
1039 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1040 default_dims[GOMP_DIM_WORKER] = worker;
1041 /* The vector size must exactly match the hardware. */
1042 if (default_dims[GOMP_DIM_VECTOR] < 1
1043 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1044 default_dims[GOMP_DIM_VECTOR] = vector;
1046 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1047 default_dims[GOMP_DIM_GANG],
1048 default_dims[GOMP_DIM_WORKER],
1049 default_dims[GOMP_DIM_VECTOR]);
1051 pthread_mutex_unlock (&ptx_dev_lock);
1053 for (i = 0; i != GOMP_DIM_MAX; i++)
1054 if (!dims[i])
1055 dims[i] = default_dims[i];
1058 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1059 the host and the device. HP is a host pointer to the new chunk, and DP is
1060 the corresponding device pointer. */
1061 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1063 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1065 /* Copy the array of arguments to the mapped page. */
1066 for (i = 0; i < mapnum; i++)
1067 ((void **) hp)[i] = devaddrs[i];
1069 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1070 fact have the same value on a unified-memory system). */
1071 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1072 mapnum * sizeof (void *));
1073 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1074 " gangs=%u, workers=%u, vectors=%u\n",
1075 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1076 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1078 // OpenACC CUDA
1080 // num_gangs nctaid.x
1081 // num_workers ntid.y
1082 // vector length ntid.x
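  // For example, dims = {gangs = 1024, workers = 8, vectors = 32} becomes a
  // 1024x1x1 grid of 32x8x1 blocks in the cuLaunchKernel call below.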
1084 kargs[0] = &dp;
1085 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1086 dims[GOMP_DIM_GANG], 1, 1,
1087 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1088 0, dev_str->stream, kargs, 0);
1090 #ifndef DISABLE_ASYNC
1091 if (async < acc_async_noval)
1093 r = cuStreamSynchronize (dev_str->stream);
1094 if (r == CUDA_ERROR_LAUNCH_FAILED)
1095 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1096 maybe_abort_msg);
1097 else if (r != CUDA_SUCCESS)
1098 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1100 else
1102 CUevent *e;
1104 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1106 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1107 if (r == CUDA_ERROR_LAUNCH_FAILED)
1108 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1109 maybe_abort_msg);
1110 else if (r != CUDA_SUCCESS)
1111 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1113 event_gc (true);
1115 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1117 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1119 #else
1120 r = cuCtxSynchronize ();
1121 if (r == CUDA_ERROR_LAUNCH_FAILED)
1122 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1123 maybe_abort_msg);
1124 else if (r != CUDA_SUCCESS)
1125 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1126 #endif
1128 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1129 targ_fn->launch->fn);
1131 #ifndef DISABLE_ASYNC
1132 if (async < acc_async_noval)
1133 #endif
1134 map_pop (dev_str);
1137 void * openacc_get_current_cuda_context (void);
1139 static void *
1140 nvptx_alloc (size_t s)
1142 CUdeviceptr d;
1144 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1145 return (void *) d;
1148 static bool
1149 nvptx_free (void *p)
1151 CUdeviceptr pb;
1152 size_t ps;
1154 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1155 if ((CUdeviceptr) p != pb)
1157 GOMP_PLUGIN_error ("invalid device address");
1158 return false;
1161 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1162 return true;
1166 static bool
1167 nvptx_host2dev (void *d, const void *h, size_t s)
1169 CUdeviceptr pb;
1170 size_t ps;
1171 struct nvptx_thread *nvthd = nvptx_thread ();
1173 if (!s)
1174 return true;
1175 if (!d)
1177 GOMP_PLUGIN_error ("invalid device address");
1178 return false;
1181 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1183 if (!pb)
1185 GOMP_PLUGIN_error ("invalid device address");
1186 return false;
1188 if (!h)
1190 GOMP_PLUGIN_error ("invalid host address");
1191 return false;
1193 if (d == h)
1195 GOMP_PLUGIN_error ("invalid host or device address");
1196 return false;
1198 if ((void *)(d + s) > (void *)(pb + ps))
1200 GOMP_PLUGIN_error ("invalid size");
1201 return false;
1204 #ifndef DISABLE_ASYNC
1205 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1207 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1208 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1209 event_gc (false);
1210 CUDA_CALL (cuMemcpyHtoDAsync,
1211 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1212 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1213 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1215 else
1216 #endif
1217 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1219 return true;
1222 static bool
1223 nvptx_dev2host (void *h, const void *d, size_t s)
1225 CUdeviceptr pb;
1226 size_t ps;
1227 struct nvptx_thread *nvthd = nvptx_thread ();
1229 if (!s)
1230 return true;
1231 if (!d)
1233 GOMP_PLUGIN_error ("invalid device address");
1234 return false;
1237 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1239 if (!pb)
1241 GOMP_PLUGIN_error ("invalid device address");
1242 return false;
1244 if (!h)
1246 GOMP_PLUGIN_error ("invalid host address");
1247 return false;
1249 if (d == h)
1251 GOMP_PLUGIN_error ("invalid host or device address");
1252 return false;
1254 if ((void *)(d + s) > (void *)(pb + ps))
1256 GOMP_PLUGIN_error ("invalid size");
1257 return false;
1260 #ifndef DISABLE_ASYNC
1261 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1263 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1264 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1265 event_gc (false);
1266 CUDA_CALL (cuMemcpyDtoHAsync,
1267 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1268 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1269 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1271 else
1272 #endif
1273 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1275 return true;
1278 static void
1279 nvptx_set_async (int async)
1281 struct nvptx_thread *nvthd = nvptx_thread ();
1282 nvthd->current_stream
1283 = select_stream_for_async (async, pthread_self (), true, NULL);
1286 static int
1287 nvptx_async_test (int async)
1289 CUresult r;
1290 struct ptx_stream *s;
1292 s = select_stream_for_async (async, pthread_self (), false, NULL);
1294 if (!s)
1295 GOMP_PLUGIN_fatal ("unknown async %d", async);
1297 r = cuStreamQuery (s->stream);
1298 if (r == CUDA_SUCCESS)
1300 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1301 whether all work has completed on this stream, and if so omits the call
1302 to the wait hook. If that happens, event_gc might not get called
1303 (which prevents variables from getting unmapped and their associated
1304 device storage freed), so call it here. */
1305 event_gc (true);
1306 return 1;
1308 else if (r == CUDA_ERROR_NOT_READY)
1309 return 0;
1311 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1313 return 0;
1316 static int
1317 nvptx_async_test_all (void)
1319 struct ptx_stream *s;
1320 pthread_t self = pthread_self ();
1321 struct nvptx_thread *nvthd = nvptx_thread ();
1323 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1325 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1327 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1328 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1330 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1331 return 0;
1335 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1337 event_gc (true);
1339 return 1;
1342 static void
1343 nvptx_wait (int async)
1345 struct ptx_stream *s;
1347 s = select_stream_for_async (async, pthread_self (), false, NULL);
1348 if (!s)
1349 GOMP_PLUGIN_fatal ("unknown async %d", async);
1351 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1353 event_gc (true);
1356 static void
1357 nvptx_wait_async (int async1, int async2)
1359 CUevent *e;
1360 struct ptx_stream *s1, *s2;
1361 pthread_t self = pthread_self ();
1363 /* The stream that is waiting (rather than being waited for) doesn't
1364 necessarily have to exist already. */
1365 s2 = select_stream_for_async (async2, self, true, NULL);
1367 s1 = select_stream_for_async (async1, self, false, NULL);
1368 if (!s1)
1369 GOMP_PLUGIN_fatal ("invalid async 1\n");
1371 if (s1 == s2)
1372 GOMP_PLUGIN_fatal ("identical parameters");
1374 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1376 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1378 event_gc (true);
1380 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1382 event_add (PTX_EVT_SYNC, e, NULL, 0);
1384 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1387 static void
1388 nvptx_wait_all (void)
1390 CUresult r;
1391 struct ptx_stream *s;
1392 pthread_t self = pthread_self ();
1393 struct nvptx_thread *nvthd = nvptx_thread ();
1395 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1397 /* Wait for active streams initiated by this thread (or by multiple threads)
1398 to complete. */
1399 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1401 if (s->multithreaded || pthread_equal (s->host_thread, self))
1403 r = cuStreamQuery (s->stream);
1404 if (r == CUDA_SUCCESS)
1405 continue;
1406 else if (r != CUDA_ERROR_NOT_READY)
1407 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1409 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1413 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1415 event_gc (true);
1418 static void
1419 nvptx_wait_all_async (int async)
1421 struct ptx_stream *waiting_stream, *other_stream;
1422 CUevent *e;
1423 struct nvptx_thread *nvthd = nvptx_thread ();
1424 pthread_t self = pthread_self ();
1426 /* The stream doing the waiting. This could be the first mention of the
1427 stream, so create it if necessary. */
1428 waiting_stream
1429 = select_stream_for_async (async, pthread_self (), true, NULL);
1431 /* Launches on the null stream already block on other streams in the
1432 context. */
1433 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1434 return;
1436 event_gc (true);
1438 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1440 for (other_stream = nvthd->ptx_dev->active_streams;
1441 other_stream != NULL;
1442 other_stream = other_stream->next)
1444 if (!other_stream->multithreaded
1445 && !pthread_equal (other_stream->host_thread, self))
1446 continue;
1448 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1450 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1452 /* Record an event on the waited-for stream. */
1453 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1455 event_add (PTX_EVT_SYNC, e, NULL, 0);
1457 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1460 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1463 static void *
1464 nvptx_get_current_cuda_device (void)
1466 struct nvptx_thread *nvthd = nvptx_thread ();
1468 if (!nvthd || !nvthd->ptx_dev)
1469 return NULL;
1471 return &nvthd->ptx_dev->dev;
1474 static void *
1475 nvptx_get_current_cuda_context (void)
1477 struct nvptx_thread *nvthd = nvptx_thread ();
1479 if (!nvthd || !nvthd->ptx_dev)
1480 return NULL;
1482 return nvthd->ptx_dev->ctx;
1485 static void *
1486 nvptx_get_cuda_stream (int async)
1488 struct ptx_stream *s;
1489 struct nvptx_thread *nvthd = nvptx_thread ();
1491 if (!nvthd || !nvthd->ptx_dev)
1492 return NULL;
1494 s = select_stream_for_async (async, pthread_self (), false, NULL);
1496 return s ? s->stream : NULL;
1499 static int
1500 nvptx_set_cuda_stream (int async, void *stream)
1502 struct ptx_stream *oldstream;
1503 pthread_t self = pthread_self ();
1504 struct nvptx_thread *nvthd = nvptx_thread ();
1506 if (async < 0)
1507 GOMP_PLUGIN_fatal ("bad async %d", async);
1509 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1511 /* We have a list of active streams and an array mapping async values to
1512 entries of that list. We need to take "ownership" of the passed-in stream,
1513 and add it to our list, removing the previous entry also (if there was one)
1514 in order to prevent resource leaks. Note the potential for surprise
1515 here: maybe we should keep track of passed-in streams and leave it up to
1516 the user to tidy those up, but that doesn't work for stream handles
1517 returned from acc_get_cuda_stream above... */
1519 oldstream = select_stream_for_async (async, self, false, NULL);
1521 if (oldstream)
1523 if (nvthd->ptx_dev->active_streams == oldstream)
1524 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1525 else
1527 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1528 while (s->next != oldstream)
1529 s = s->next;
1530 s->next = s->next->next;
1533 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1535 if (!map_fini (oldstream))
1536 GOMP_PLUGIN_fatal ("error when freeing host memory");
1538 free (oldstream);
1541 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1543 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1545 return 1;
1548 /* Plugin entry points. */
1550 const char *
1551 GOMP_OFFLOAD_get_name (void)
1553 return "nvptx";
1556 unsigned int
1557 GOMP_OFFLOAD_get_caps (void)
1559 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1563 GOMP_OFFLOAD_get_type (void)
1565 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1569 GOMP_OFFLOAD_get_num_devices (void)
1571 return nvptx_get_num_devices ();
1574 bool
1575 GOMP_OFFLOAD_init_device (int n)
1577 struct ptx_device *dev;
1579 pthread_mutex_lock (&ptx_dev_lock);
1581 if (!nvptx_init () || ptx_devices[n] != NULL)
1583 pthread_mutex_unlock (&ptx_dev_lock);
1584 return false;
1587 dev = nvptx_open_device (n);
1588 if (dev)
1590 ptx_devices[n] = dev;
1591 instantiated_devices++;
1594 pthread_mutex_unlock (&ptx_dev_lock);
1596 return dev != NULL;
1599 bool
1600 GOMP_OFFLOAD_fini_device (int n)
1602 pthread_mutex_lock (&ptx_dev_lock);
1604 if (ptx_devices[n] != NULL)
1606 if (!nvptx_attach_host_thread_to_device (n)
1607 || !nvptx_close_device (ptx_devices[n]))
1609 pthread_mutex_unlock (&ptx_dev_lock);
1610 return false;
1612 ptx_devices[n] = NULL;
1613 instantiated_devices--;
1616 pthread_mutex_unlock (&ptx_dev_lock);
1617 return true;
1620 /* Return the libgomp version number we're compatible with. There is
1621 no requirement for cross-version compatibility. */
1623 unsigned
1624 GOMP_OFFLOAD_version (void)
1626 return GOMP_VERSION;
1629 /* Initialize __nvptx_clocktick, if present in MODULE. */
1631 static void
1632 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1634 CUdeviceptr dptr;
1635 CUresult r = cuModuleGetGlobal (&dptr, NULL, module, "__nvptx_clocktick");
1636 if (r == CUDA_ERROR_NOT_FOUND)
1637 return;
1638 if (r != CUDA_SUCCESS)
1639 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1640 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1641 r = cuMemcpyHtoD (dptr, &__nvptx_clocktick, sizeof (__nvptx_clocktick));
1642 if (r != CUDA_SUCCESS)
1643 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1646 /* Load the (partial) program described by TARGET_DATA to device
1647 number ORD. Allocate and return TARGET_TABLE. */
1650 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1651 struct addr_pair **target_table)
1653 CUmodule module;
1654 const char *const *var_names;
1655 const struct targ_fn_launch *fn_descs;
1656 unsigned int fn_entries, var_entries, i, j;
1657 struct targ_fn_descriptor *targ_fns;
1658 struct addr_pair *targ_tbl;
1659 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1660 struct ptx_image_data *new_image;
1661 struct ptx_device *dev;
1663 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1665 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1666 " (expected %u, received %u)",
1667 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1668 return -1;
1671 if (!nvptx_attach_host_thread_to_device (ord)
1672 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1673 return -1;
1675 dev = ptx_devices[ord];
1677 /* The mkoffload utility emits a struct of pointers/integers at the
start of each offload image.  The array of kernel names and the
function addresses form a one-to-one correspondence.  */
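  /* In the address table we hand back, each function entry's START points at
     its struct targ_fn_descriptor (END is just START + 1), while each
     variable entry carries the actual device address and size obtained from
     cuModuleGetGlobal.  */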
1681 var_entries = img_header->var_num;
1682 var_names = img_header->var_names;
1683 fn_entries = img_header->fn_num;
1684 fn_descs = img_header->fn_descs;
1686 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1687 * (fn_entries + var_entries));
1688 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1689 * fn_entries);
1691 *target_table = targ_tbl;
1693 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1694 new_image->target_data = target_data;
1695 new_image->module = module;
1696 new_image->fns = targ_fns;
1698 pthread_mutex_lock (&dev->image_lock);
1699 new_image->next = dev->images;
1700 dev->images = new_image;
1701 pthread_mutex_unlock (&dev->image_lock);
1703 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1705 CUfunction function;
1706 int nregs, mthrs;
1708 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1709 fn_descs[i].fn);
1710 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1711 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1712 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1713 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1715 targ_fns->fn = function;
1716 targ_fns->launch = &fn_descs[i];
1717 targ_fns->regs_per_thread = nregs;
1718 targ_fns->max_threads_per_block = mthrs;
1720 targ_tbl->start = (uintptr_t) targ_fns;
1721 targ_tbl->end = targ_tbl->start + 1;
1724 for (j = 0; j < var_entries; j++, targ_tbl++)
1726 CUdeviceptr var;
1727 size_t bytes;
1729 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1730 &var, &bytes, module, var_names[j]);
1732 targ_tbl->start = (uintptr_t) var;
1733 targ_tbl->end = targ_tbl->start + bytes;
1736 nvptx_set_clocktick (module, dev);
1738 return fn_entries + var_entries;
1741 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1742 function descriptors allocated by G_O_load_image. */
1744 bool
1745 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1747 struct ptx_image_data *image, **prev_p;
1748 struct ptx_device *dev = ptx_devices[ord];
1750 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1752 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1753 " (expected %u, received %u)",
1754 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1755 return false;
1758 bool ret = true;
1759 pthread_mutex_lock (&dev->image_lock);
1760 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1761 if (image->target_data == target_data)
1763 *prev_p = image->next;
1764 if (cuModuleUnload (image->module) != CUDA_SUCCESS)
1765 ret = false;
1766 free (image->fns);
1767 free (image);
1768 break;
1770 pthread_mutex_unlock (&dev->image_lock);
1771 return ret;
1774 void *
1775 GOMP_OFFLOAD_alloc (int ord, size_t size)
1777 if (!nvptx_attach_host_thread_to_device (ord))
1778 return NULL;
1779 return nvptx_alloc (size);
1782 bool
1783 GOMP_OFFLOAD_free (int ord, void *ptr)
1785 return (nvptx_attach_host_thread_to_device (ord)
1786 && nvptx_free (ptr));
1789 bool
1790 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1792 return (nvptx_attach_host_thread_to_device (ord)
1793 && nvptx_dev2host (dst, src, n));
1796 bool
1797 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1799 return (nvptx_attach_host_thread_to_device (ord)
1800 && nvptx_host2dev (dst, src, n));
1803 bool
1804 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1806 struct ptx_device *ptx_dev = ptx_devices[ord];
1807 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1808 ptx_dev->null_stream->stream);
1809 return true;
1812 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1814 void
1815 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1816 void **hostaddrs, void **devaddrs,
1817 int async, unsigned *dims, void *targ_mem_desc)
1819 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1822 void
1823 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1825 struct nvptx_thread *nvthd = nvptx_thread ();
1826 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1828 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1829 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1830 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1834 GOMP_OFFLOAD_openacc_async_test (int async)
1836 return nvptx_async_test (async);
1840 GOMP_OFFLOAD_openacc_async_test_all (void)
1842 return nvptx_async_test_all ();
1845 void
1846 GOMP_OFFLOAD_openacc_async_wait (int async)
1848 nvptx_wait (async);
1851 void
1852 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1854 nvptx_wait_async (async1, async2);
1857 void
1858 GOMP_OFFLOAD_openacc_async_wait_all (void)
1860 nvptx_wait_all ();
1863 void
1864 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1866 nvptx_wait_all_async (async);
1869 void
1870 GOMP_OFFLOAD_openacc_async_set_async (int async)
1872 nvptx_set_async (async);
1875 void *
1876 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1878 struct ptx_device *ptx_dev;
1879 struct nvptx_thread *nvthd
1880 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1881 CUcontext thd_ctx;
1883 ptx_dev = ptx_devices[ord];
1885 assert (ptx_dev);
1887 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1889 assert (ptx_dev->ctx);
1891 if (!thd_ctx)
1892 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1894 nvthd->current_stream = ptx_dev->null_stream;
1895 nvthd->ptx_dev = ptx_dev;
1897 return (void *) nvthd;
1900 void
1901 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1903 free (data);
1906 void *
1907 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1909 return nvptx_get_current_cuda_device ();
1912 void *
1913 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1915 return nvptx_get_current_cuda_context ();
1918 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
1920 void *
1921 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1923 return nvptx_get_cuda_stream (async);
1926 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
1929 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1931 return nvptx_set_cuda_stream (async, stream);
1934 /* Adjust launch dimensions: pick good values for number of blocks and warps
and ensure that the number of warps does not exceed CUDA limits or GCC's
own limits.  */
1938 static void
1939 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1940 struct ptx_device *ptx_dev,
1941 int *teams_p, int *threads_p)
1943 int max_warps_block = fn->max_threads_per_block / 32;
1944 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
and libgcc, which matches the documented limit of all GPUs as of 2015.  */
1946 if (max_warps_block > 32)
1947 max_warps_block = 32;
1948 if (*threads_p <= 0)
1949 *threads_p = 8;
1950 if (*threads_p > max_warps_block)
1951 *threads_p = max_warps_block;
1953 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1954 /* This is an estimate of how many blocks the device can host simultaneously.
The actual limit, which may be lower, can be queried via the "occupancy
control" driver interface (available since CUDA 6.0).  */
1957 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1958 if (*teams_p <= 0 || *teams_p > max_blocks)
1959 *teams_p = max_blocks;
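/* Worked example with hypothetical figures: a device reporting 1024 threads
   per block gives a cap of 32 warps; a kernel using 32 registers per thread
   at the default of 8 warps needs 32 * 32 * 8 = 8192 registers per block, so
   on a device with 65536 registers per SM and 16 SMs the estimate is
   65536 / 8192 * 16 = 128 resident blocks.  */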
1962 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1963 target regions. */
1965 static size_t
1966 nvptx_stacks_size ()
1968 return 128 * 1024;
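/* GOMP_OFFLOAD_run below allocates one such stack per launched warp, i.e.
   teams * threads * 128 KiB of device memory in total.  */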
1971 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1973 static void *
1974 nvptx_stacks_alloc (size_t size, int num)
1976 CUdeviceptr stacks;
1977 CUresult r = cuMemAlloc (&stacks, size * num);
1978 if (r != CUDA_SUCCESS)
1979 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1980 return (void *) stacks;
1983 /* Release storage previously allocated by nvptx_stacks_alloc. */
1985 static void
1986 nvptx_stacks_free (void *p, int num)
1988 CUresult r = cuMemFree ((CUdeviceptr) p);
1989 if (r != CUDA_SUCCESS)
1990 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1993 void
1994 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1996 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1997 CUresult r;
1998 struct ptx_device *ptx_dev = ptx_devices[ord];
1999 const char *maybe_abort_msg = "(perhaps abort was called)";
2000 int teams = 0, threads = 0;
2002 if (!args)
2003 GOMP_PLUGIN_fatal ("No target arguments provided");
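  /* Walk the NULL-terminated argument list.  Each identifier word either
     carries its value inline (shifted by GOMP_TARGET_ARG_VALUE_SHIFT) or
     flags that the value occupies the following element; only the
     device-agnostic NUM_TEAMS and THREAD_LIMIT arguments are honoured
     here.  */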
2004 while (*args)
2006 intptr_t id = (intptr_t) *args++, val;
2007 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2008 val = (intptr_t) *args++;
2009 else
2010 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2011 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2012 continue;
2013 val = val > INT_MAX ? INT_MAX : val;
2014 id &= GOMP_TARGET_ARG_ID_MASK;
2015 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2016 teams = val;
2017 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2018 threads = val;
2020 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2022 size_t stack_size = nvptx_stacks_size ();
2023 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2024 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2025 size_t fn_args_size = sizeof fn_args;
2026 void *config[] = {
2027 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2028 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2029 CU_LAUNCH_PARAM_END
2031 r = cuLaunchKernel (function,
2032 teams, 1, 1,
2033 32, threads, 1,
2034 0, ptx_dev->null_stream->stream, NULL, config);
2035 if (r != CUDA_SUCCESS)
2036 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2038 r = cuCtxSynchronize ();
2039 if (r == CUDA_ERROR_LAUNCH_FAILED)
2040 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2041 maybe_abort_msg);
2042 else if (r != CUDA_SUCCESS)
2043 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2044 nvptx_stacks_free (stacks, teams * threads);
2047 void
2048 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2049 void *async_data)
2051 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");