libgomp/plugin/plugin-nvptx.c

   1 /* Plugin for NVPTX execution.
   2
   3    Copyright (C) 2013-2018 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded.
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
  30    library appears to hold some implicit state, but the documentation
  31    is not clear as to what that state might be.  Or how one might
  32    propagate it from one thread to another.  */
  33
  34 #define _GNU_SOURCE
  35 #include "openacc.h"
  36 #include "config.h"
  37 #include "libgomp-plugin.h"
  38 #include "oacc-plugin.h"
  39 #include "gomp-constants.h"
  40
  41 #include <pthread.h>
  42 #include <cuda.h>
  43 #include <stdbool.h>
  44 #include <stdint.h>
  45 #include <limits.h>
  46 #include <string.h>
  47 #include <stdio.h>
  48 #include <unistd.h>
  49 #include <assert.h>
  50 #include <errno.h>
  51
  52 #if CUDA_VERSION < 6000
  53 extern CUresult cuGetErrorString (CUresult, const char **);
  54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
  55 #endif
  56
  57 #if CUDA_VERSION >= 6050
  58 #undef cuLinkCreate
  59 #undef cuLinkAddData
  60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
  61                         const char *, unsigned, CUjit_option *, void **);
  62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
  63 #else
  64 typedef size_t (*CUoccupancyB2DSize)(int);
  65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
  66                            const char *, unsigned, CUjit_option *, void **);
  67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
  68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
  69                                           CUoccupancyB2DSize, size_t, int);
  70 #endif
  71
  72 #define DO_PRAGMA(x) _Pragma (#x)
  73
  74 #if PLUGIN_NVPTX_DYNAMIC
  75 # include <dlfcn.h>
  76
  77 struct cuda_lib_s {
  78
  79 # define CUDA_ONE_CALL(call)                    \
  80   __typeof (call) *call;
  81 # define CUDA_ONE_CALL_MAYBE_NULL(call)         \
  82   CUDA_ONE_CALL (call)
  83 #include "cuda-lib.def"
  84 # undef CUDA_ONE_CALL
  85 # undef CUDA_ONE_CALL_MAYBE_NULL
  86
  87 } cuda_lib;
  88
  89 /* -1 if init_cuda_lib has not been called yet, false
  90    if it has been and failed, true if it has been and succeeded.  */
  91 static signed char cuda_lib_inited = -1;
  92
  93 /* Dynamically load the CUDA runtime library and initialize function
  94    pointers, return false if unsuccessful, true if successful.  */
  95 static bool
  96 init_cuda_lib (void)
  97 {
  98   if (cuda_lib_inited != -1)
  99     return cuda_lib_inited;
 100   const char *cuda_runtime_lib = "libcuda.so.1";
 101   void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
 102   cuda_lib_inited = false;
 103   if (h == NULL)
 104     return false;
 105
 106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
 107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
 108 # define CUDA_ONE_CALL_1(call, allow_null)              \
 109   cuda_lib.call = dlsym (h, #call);     \
 110   if (!allow_null && cuda_lib.call == NULL)             \
 111     return false;
 112 #include "cuda-lib.def"
 113 # undef CUDA_ONE_CALL
 114 # undef CUDA_ONE_CALL_1
 115 # undef CUDA_ONE_CALL_MAYBE_NULL
 116
 117   cuda_lib_inited = true;
 118   return true;
 119 }
 120 # define CUDA_CALL_PREFIX cuda_lib.
 121 #else
 122
 123 # define CUDA_ONE_CALL(call)
 124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
 125 #include "cuda-lib.def"
 126 #undef CUDA_ONE_CALL_MAYBE_NULL
 127 #undef CUDA_ONE_CALL
 128
 129 # define CUDA_CALL_PREFIX
 130 # define init_cuda_lib() true
 131 #endif
 132
 133 #include "secure_getenv.h"
 134
 135 #undef MIN
 136 #undef MAX
 137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
 138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
 139
 140 /* Convenience macros for the frequently used CUDA library call and
 141    error handling sequence as well as CUDA library calls that
 142    do the error checking themselves or don't do it at all.  */
 143
 144 #define CUDA_CALL_ERET(ERET, FN, ...)           \
 145   do {                                          \
 146     unsigned __r                                \
 147       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 148     if (__r != CUDA_SUCCESS)                    \
 149       {                                         \
 150         GOMP_PLUGIN_error (#FN " error: %s",    \
 151                            cuda_error (__r));   \
 152         return ERET;                            \
 153       }                                         \
 154   } while (0)
 155
 156 #define CUDA_CALL(FN, ...)                      \
 157   CUDA_CALL_ERET (false, FN, __VA_ARGS__)
 158
 159 #define CUDA_CALL_ASSERT(FN, ...)               \
 160   do {                                          \
 161     unsigned __r                                \
 162       = CUDA_CALL_PREFIX FN (__VA_ARGS__);      \
 163     if (__r != CUDA_SUCCESS)                    \
 164       {                                         \
 165         GOMP_PLUGIN_fatal (#FN " error: %s",    \
 166                            cuda_error (__r));   \
 167       }                                         \
 168   } while (0)
 169
 170 #define CUDA_CALL_NOCHECK(FN, ...)              \
 171   CUDA_CALL_PREFIX FN (__VA_ARGS__)
 172
 173 #define CUDA_CALL_EXISTS(FN)                    \
 174   CUDA_CALL_PREFIX FN
 175
 176 static const char *
 177 cuda_error (CUresult r)
 178 {
 179   const char *fallback = "unknown cuda error";
 180   const char *desc;
 181
 182   if (!CUDA_CALL_EXISTS (cuGetErrorString))
 183     return fallback;
 184
 185   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
 186   if (r == CUDA_SUCCESS)
 187     return desc;
 188
 189   return fallback;
 190 }
 191
 192 static unsigned int instantiated_devices = 0;
 193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 194
 195 struct cuda_map
 196 {
 197   CUdeviceptr d;
 198   size_t size;
 199   bool active;
 200   struct cuda_map *next;
 201 };
 202
 203 struct ptx_stream
 204 {
 205   CUstream stream;
 206   pthread_t host_thread;
 207   bool multithreaded;
 208   struct cuda_map *map;
 209   struct ptx_stream *next;
 210 };
 211
 212 /* Thread-specific data for PTX.  */
 213
 214 struct nvptx_thread
 215 {
 216   struct ptx_stream *current_stream;
 217   struct ptx_device *ptx_dev;
 218 };
 219
 220 static struct cuda_map *
 221 cuda_map_create (size_t size)
 222 {
 223   struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
 224
 225   assert (map);
 226
 227   map->next = NULL;
 228   map->size = size;
 229   map->active = false;
 230
 231   CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
 232   assert (map->d);
 233
 234   return map;
 235 }
 236
 237 static void
 238 cuda_map_destroy (struct cuda_map *map)
 239 {
 240   CUDA_CALL_ASSERT (cuMemFree, map->d);
 241   free (map);
 242 }
 243
/* The following map_* routines manage the CUDA device memory that
   contains the data mapping arguments for cuLaunchKernel.  Each
   asynchronous PTX stream may have multiple pending kernel
   invocations, which are launched in FIFO order.  As such, the map
   routines maintain a queue of cuLaunchKernel arguments.

   Calls to map_push and map_pop must be guarded by ptx_event_lock.
   Likewise, calls to map_init and map_fini are guarded by
   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
   GOMP_OFFLOAD_fini_device, respectively.  */
 254
 255 static bool
 256 map_init (struct ptx_stream *s)
 257 {
 258   int size = getpagesize ();
 259
 260   assert (s);
 261
 262   s->map = cuda_map_create (size);
 263
 264   return true;
 265 }
 266
 267 static bool
 268 map_fini (struct ptx_stream *s)
 269 {
 270   assert (s->map->next == NULL);
 271   assert (!s->map->active);
 272
 273   cuda_map_destroy (s->map);
 274
 275   return true;
 276 }
 277
 278 static void
 279 map_pop (struct ptx_stream *s)
 280 {
 281   struct cuda_map *next;
 282
 283   assert (s != NULL);
 284
 285   if (s->map->next == NULL)
 286     {
 287       s->map->active = false;
 288       return;
 289     }
 290
 291   next = s->map->next;
 292   cuda_map_destroy (s->map);
 293   s->map = next;
 294 }
 295
 296 static CUdeviceptr
 297 map_push (struct ptx_stream *s, size_t size)
 298 {
 299   struct cuda_map *map = NULL, *t = NULL;
 300
 301   assert (s);
 302   assert (s->map);
 303
 304   /* Each PTX stream requires a separate data region to store the
 305      launch arguments for cuLaunchKernel.  Allocate a new
 306      cuda_map and push it to the end of the list.  */
 307   if (s->map->active)
 308     {
 309       map = cuda_map_create (size);
 310
 311       for (t = s->map; t->next != NULL; t = t->next)
 312         ;
 313
 314       t->next = map;
 315     }
 316   else if (s->map->size < size)
 317     {
 318       cuda_map_destroy (s->map);
 319       map = cuda_map_create (size);
 320     }
 321   else
 322     map = s->map;
 323
 324   s->map = map;
 325   s->map->active = true;
 326
 327   return s->map->d;
 328 }
 329
 330 /* Target data function launch information.  */
 331
 332 struct targ_fn_launch
 333 {
 334   const char *fn;
 335   unsigned short dim[GOMP_DIM_MAX];
 336 };
 337
 338 /* Target PTX object information.  */
 339
 340 struct targ_ptx_obj
 341 {
 342   const char *code;
 343   size_t size;
 344 };
 345
 346 /* Target data image information.  */
 347
 348 typedef struct nvptx_tdata
 349 {
 350   const struct targ_ptx_obj *ptx_objs;
 351   unsigned ptx_num;
 352
 353   const char *const *var_names;
 354   unsigned var_num;
 355
 356   const struct targ_fn_launch *fn_descs;
 357   unsigned fn_num;
 358 } nvptx_tdata_t;
 359
 360 /* Descriptor of a loaded function.  */
 361
 362 struct targ_fn_descriptor
 363 {
 364   CUfunction fn;
 365   const struct targ_fn_launch *launch;
 366   int regs_per_thread;
 367   int max_threads_per_block;
 368 };
 369
 370 /* A loaded PTX image.  */
 371 struct ptx_image_data
 372 {
 373   const void *target_data;
 374   CUmodule module;
 375
 376   struct targ_fn_descriptor *fns;  /* Array of functions.  */
 377
 378   struct ptx_image_data *next;
 379 };
 380
 381 struct ptx_device
 382 {
 383   CUcontext ctx;
 384   bool ctx_shared;
 385   CUdevice dev;
 386   struct ptx_stream *null_stream;
 387   /* All non-null streams associated with this device (actually context),
 388      either created implicitly or passed in from the user (via
 389      acc_set_cuda_stream).  */
 390   struct ptx_stream *active_streams;
 391   struct {
 392     struct ptx_stream **arr;
 393     int size;
 394   } async_streams;
 395   /* A lock for use when manipulating the above stream list and array.  */
 396   pthread_mutex_t stream_lock;
 397   int ord;
 398   bool overlap;
 399   bool map;
 400   bool concur;
 401   bool mkern;
 402   int  mode;
 403   int clock_khz;
 404   int num_sms;
 405   int regs_per_block;
 406   int regs_per_sm;
 407   int warp_size;
 408   int max_threads_per_block;
 409   int max_threads_per_multiprocessor;
 410   int default_dims[GOMP_DIM_MAX];
 411
 412   struct ptx_image_data *images;  /* Images loaded on device.  */
 413   pthread_mutex_t image_lock;     /* Lock for above list.  */
 414
 415   struct ptx_device *next;
 416 };
 417
 418 enum ptx_event_type
 419 {
 420   PTX_EVT_MEM,
 421   PTX_EVT_KNL,
 422   PTX_EVT_SYNC,
 423   PTX_EVT_ASYNC_CLEANUP
 424 };
 425
 426 struct ptx_event
 427 {
 428   CUevent *evt;
 429   int type;
 430   void *addr;
 431   int ord;
 432   int val;
 433
 434   struct ptx_event *next;
 435 };
 436
 437 static pthread_mutex_t ptx_event_lock;
 438 static struct ptx_event *ptx_events;
 439
 440 static struct ptx_device **ptx_devices;
 441
 442 static inline struct nvptx_thread *
 443 nvptx_thread (void)
 444 {
 445   return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
 446 }
 447
 448 static bool
 449 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 450 {
 451   int i;
 452   struct ptx_stream *null_stream
 453     = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 454
 455   null_stream->stream = NULL;
 456   null_stream->host_thread = pthread_self ();
 457   null_stream->multithreaded = true;
 458   if (!map_init (null_stream))
 459     return false;
 460
 461   ptx_dev->null_stream = null_stream;
 462   ptx_dev->active_streams = NULL;
 463   pthread_mutex_init (&ptx_dev->stream_lock, NULL);
 464
 465   if (concurrency < 1)
 466     concurrency = 1;
 467
 468   /* This is just a guess -- make space for as many async streams as the
 469      current device is capable of concurrently executing.  This can grow
 470      later as necessary.  No streams are created yet.  */
 471   ptx_dev->async_streams.arr
 472     = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
 473   ptx_dev->async_streams.size = concurrency;
 474
 475   for (i = 0; i < concurrency; i++)
 476     ptx_dev->async_streams.arr[i] = NULL;
 477
 478   return true;
 479 }
 480
 481 static bool
 482 fini_streams_for_device (struct ptx_device *ptx_dev)
 483 {
 484   free (ptx_dev->async_streams.arr);
 485
 486   bool ret = true;
 487   while (ptx_dev->active_streams != NULL)
 488     {
 489       struct ptx_stream *s = ptx_dev->active_streams;
 490       ptx_dev->active_streams = ptx_dev->active_streams->next;
 491
 492       ret &= map_fini (s);
 493
 494       CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
 495       if (r != CUDA_SUCCESS)
 496         {
 497           GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
 498           ret = false;
 499         }
 500       free (s);
 501     }
 502
 503   ret &= map_fini (ptx_dev->null_stream);
 504   free (ptx_dev->null_stream);
 505   return ret;
 506 }
 507
 508 /* Select a stream for (OpenACC-semantics) ASYNC argument for the current
 509    thread THREAD (and also current device/context).  If CREATE is true, create
 510    the stream if it does not exist (or use EXISTING if it is non-NULL), and
 511    associate the stream with the same thread argument.  Returns stream to use
 512    as result.  */
 513
 514 static struct ptx_stream *
 515 select_stream_for_async (int async, pthread_t thread, bool create,
 516                          CUstream existing)
 517 {
 518   struct nvptx_thread *nvthd = nvptx_thread ();
 519   /* Local copy of TLS variable.  */
 520   struct ptx_device *ptx_dev = nvthd->ptx_dev;
 521   struct ptx_stream *stream = NULL;
 522   int orig_async = async;
 523
 524   /* The special value acc_async_noval (-1) maps (for now) to an
 525      implicitly-created stream, which is then handled the same as any other
 526      numbered async stream.  Other options are available, e.g. using the null
 527      stream for anonymous async operations, or choosing an idle stream from an
 528      active set.  But, stick with this for now.  */
 529   if (async > acc_async_sync)
 530     async++;
 531
 532   if (create)
 533     pthread_mutex_lock (&ptx_dev->stream_lock);
 534
 535   /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
 536      null stream, and in fact better performance may be obtainable if it doesn't
 537      (because the null stream enforces overly-strict synchronisation with
 538      respect to other streams for legacy reasons, and that's probably not
 539      needed with OpenACC).  Maybe investigate later.  */
 540   if (async == acc_async_sync)
 541     stream = ptx_dev->null_stream;
 542   else if (async >= 0 && async < ptx_dev->async_streams.size
 543            && ptx_dev->async_streams.arr[async] && !(create && existing))
 544     stream = ptx_dev->async_streams.arr[async];
 545   else if (async >= 0 && create)
 546     {
 547       if (async >= ptx_dev->async_streams.size)
 548         {
 549           int i, newsize = ptx_dev->async_streams.size * 2;
 550
 551           if (async >= newsize)
 552             newsize = async + 1;
 553
 554           ptx_dev->async_streams.arr
 555             = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
 556                                    newsize * sizeof (struct ptx_stream *));
 557
 558           for (i = ptx_dev->async_streams.size; i < newsize; i++)
 559             ptx_dev->async_streams.arr[i] = NULL;
 560
 561           ptx_dev->async_streams.size = newsize;
 562         }
 563
 564       /* Create a new stream on-demand if there isn't one already, or if we're
 565          setting a particular async value to an existing (externally-provided)
 566          stream.  */
 567       if (!ptx_dev->async_streams.arr[async] || existing)
 568         {
 569           CUresult r;
 570           struct ptx_stream *s
 571             = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
 572
 573           if (existing)
 574             s->stream = existing;
 575           else
 576             {
 577               r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
 578                                      CU_STREAM_DEFAULT);
 579               if (r != CUDA_SUCCESS)
 580                 {
 581                   pthread_mutex_unlock (&ptx_dev->stream_lock);
 582                   GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
 583                                      cuda_error (r));
 584                 }
 585             }
 586
 587           /* If CREATE is true, we're going to be queueing some work on this
 588              stream.  Associate it with the current host thread.  */
 589           s->host_thread = thread;
 590           s->multithreaded = false;
 591
 592           if (!map_init (s))
 593             {
 594               pthread_mutex_unlock (&ptx_dev->stream_lock);
 595               GOMP_PLUGIN_fatal ("map_init fail");
 596             }
 597
 598           s->next = ptx_dev->active_streams;
 599           ptx_dev->active_streams = s;
 600           ptx_dev->async_streams.arr[async] = s;
 601         }
 602
 603       stream = ptx_dev->async_streams.arr[async];
 604     }
 605   else if (async < 0)
 606     {
 607       if (create)
 608         pthread_mutex_unlock (&ptx_dev->stream_lock);
 609       GOMP_PLUGIN_fatal ("bad async %d", async);
 610     }
 611
 612   if (create)
 613     {
 614       assert (stream != NULL);
 615
 616       /* If we're trying to use the same stream from different threads
 617          simultaneously, set stream->multithreaded to true.  This affects the
 618          behaviour of acc_async_test_all and acc_wait_all, which are supposed to
 619          only wait for asynchronous launches from the same host thread they are
 620          invoked on.  If multiple threads use the same async value, we make note
 621          of that here and fall back to testing/waiting for all threads in those
 622          functions.  */
 623       if (thread != stream->host_thread)
 624         stream->multithreaded = true;
 625
 626       pthread_mutex_unlock (&ptx_dev->stream_lock);
 627     }
 628   else if (stream && !stream->multithreaded
 629            && !pthread_equal (stream->host_thread, thread))
 630     GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
 631
 632   return stream;
 633 }
 634
 635 /* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
 636    should be locked on entry and remains locked on exit.  */
 637
 638 static bool
 639 nvptx_init (void)
 640 {
 641   int ndevs;
 642
 643   if (instantiated_devices != 0)
 644     return true;
 645
 646   ptx_events = NULL;
 647   pthread_mutex_init (&ptx_event_lock, NULL);
 648
 649   if (!init_cuda_lib ())
 650     return false;
 651
 652   CUDA_CALL (cuInit, 0);
 653
 654   CUDA_CALL (cuDeviceGetCount, &ndevs);
 655   ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
 656                                             * ndevs);
 657   return true;
 658 }
 659
 660 /* Select the N'th PTX device for the current host thread.  The device must
 661    have been previously opened before calling this function.  */
 662
 663 static bool
 664 nvptx_attach_host_thread_to_device (int n)
 665 {
 666   CUdevice dev;
 667   CUresult r;
 668   struct ptx_device *ptx_dev;
 669   CUcontext thd_ctx;
 670
 671   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
 672   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 673     {
 674       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 675       return false;
 676     }
 677
 678   if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
 679     return true;
 680   else
 681     {
 682       CUcontext old_ctx;
 683
 684       ptx_dev = ptx_devices[n];
 685       if (!ptx_dev)
 686         {
 687           GOMP_PLUGIN_error ("device %d not found", n);
 688           return false;
 689         }
 690
 691       CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
 692
 693       /* We don't necessarily have a current context (e.g. if it has been
 694          destroyed.  Pop it if we do though.  */
 695       if (thd_ctx != NULL)
 696         CUDA_CALL (cuCtxPopCurrent, &old_ctx);
 697
 698       CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
 699     }
 700   return true;
 701 }
 702
 703 static struct ptx_device *
 704 nvptx_open_device (int n)
 705 {
 706   struct ptx_device *ptx_dev;
 707   CUdevice dev, ctx_dev;
 708   CUresult r;
 709   int async_engines, pi;
 710
 711   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
 712
 713   ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
 714
 715   ptx_dev->ord = n;
 716   ptx_dev->dev = dev;
 717   ptx_dev->ctx_shared = false;
 718
 719   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
 720   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
 721     {
 722       GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
 723       return NULL;
 724     }
 725
 726   if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
 727     {
 728       /* The current host thread has an active context for a different device.
 729          Detach it.  */
 730       CUcontext old_ctx;
 731       CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
 732     }
 733
 734   CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
 735
 736   if (!ptx_dev->ctx)
 737     CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
 738   else
 739     ptx_dev->ctx_shared = true;
 740
 741   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 742                   &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
 743   ptx_dev->overlap = pi;
 744
 745   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 746                   &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
 747   ptx_dev->map = pi;
 748
 749   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 750                   &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
 751   ptx_dev->concur = pi;
 752
 753   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 754                   &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
 755   ptx_dev->mode = pi;
 756
 757   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 758                   &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
 759   ptx_dev->mkern = pi;
 760
 761   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 762                   &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
 763   ptx_dev->clock_khz = pi;
 764
 765   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 766                   &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
 767   ptx_dev->num_sms = pi;
 768
 769   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 770                   &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
 771   ptx_dev->regs_per_block = pi;
 772
 773   /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
 774      in CUDA 6.0 and newer.  */
 775   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
 776                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
 777                          dev);
 778   /* Fallback: use limit of registers per block, which is usually equal.  */
 779   if (r == CUDA_ERROR_INVALID_VALUE)
 780     pi = ptx_dev->regs_per_block;
 781   else if (r != CUDA_SUCCESS)
 782     {
 783       GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
 784       return NULL;
 785     }
 786   ptx_dev->regs_per_sm = pi;
 787
 788   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
 789                   &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
 790   if (pi != 32)
 791     {
 792       GOMP_PLUGIN_error ("Only warp size 32 is supported");
 793       return NULL;
 794     }
 795   ptx_dev->warp_size = pi;
 796
 797   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 798                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
 799   ptx_dev->max_threads_per_block = pi;
 800
 801   CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
 802                   CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
 803   ptx_dev->max_threads_per_multiprocessor = pi;
 804
 805   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 806                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
 807   if (r != CUDA_SUCCESS)
 808     async_engines = 1;
 809
 810   for (int i = 0; i != GOMP_DIM_MAX; i++)
 811     ptx_dev->default_dims[i] = 0;
 812
 813   ptx_dev->images = NULL;
 814   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 815
 816   if (!init_streams_for_device (ptx_dev, async_engines))
 817     return NULL;
 818
 819   return ptx_dev;
 820 }
 821
 822 static bool
 823 nvptx_close_device (struct ptx_device *ptx_dev)
 824 {
 825   if (!ptx_dev)
 826     return true;
 827
 828   if (!fini_streams_for_device (ptx_dev))
 829     return false;
 830
 831   pthread_mutex_destroy (&ptx_dev->image_lock);
 832
 833   if (!ptx_dev->ctx_shared)
 834     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 835
 836   free (ptx_dev);
 837   return true;
 838 }
 839
 840 static int
 841 nvptx_get_num_devices (void)
 842 {
 843   int n;
 844
 845   /* PR libgomp/65099: Currently, we only support offloading in 64-bit
 846      configurations.  */
 847   if (sizeof (void *) != 8)
 848     {
 849       GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
 850                          " only 64-bit configurations are supported\n");
 851       return 0;
 852     }
 853
 854   /* This function will be called before the plugin has been initialized in
 855      order to enumerate available devices, but CUDA API routines can't be used
 856      until cuInit has been called.  Just call it now (but don't yet do any
 857      further initialization).  */
 858   if (instantiated_devices == 0)
 859     {
 860       if (!init_cuda_lib ())
 861         return 0;
 862       CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
 863       /* This is not an error: e.g. we may have CUDA libraries installed but
 864          no devices available.  */
 865       if (r != CUDA_SUCCESS)
 866         {
 867           GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
 868                              cuda_error (r));
 869           return 0;
 870         }
 871     }
 872
 873   CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
 874   return n;
 875 }
 876
 877 static void
 878 notify_var (const char *var_name, const char *env_var)
 879 {
 880   if (env_var == NULL)
 881     GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
 882   else
 883     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 884 }
 885
 886 static void
 887 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
 888 {
 889   const char *var_name = "GOMP_NVPTX_JIT";
 890   const char *env_var = secure_getenv (var_name);
 891   notify_var (var_name, env_var);
 892
 893   if (env_var == NULL)
 894     return;
 895
 896   const char *c = env_var;
 897   while (*c != '\0')
 898     {
 899       while (*c == ' ')
 900         c++;
 901
 902       if (c[0] == '-' && c[1] == 'O'
 903           && '0' <= c[2] && c[2] <= '4'
 904           && (c[3] == '\0' || c[3] == ' '))
 905         {
 906           *gomp_nvptx_o = c[2] - '0';
 907           c += 3;
 908           continue;
 909         }
 910
 911       GOMP_PLUGIN_error ("Error parsing %s", var_name);
 912       break;
 913     }
 914 }
 915
 916 static bool
 917 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 918           unsigned num_objs)
 919 {
 920   CUjit_option opts[7];
 921   void *optvals[7];
 922   float elapsed = 0.0;
 923   char elog[1024];
 924   char ilog[16384];
 925   CUlinkState linkstate;
 926   CUresult r;
 927   void *linkout;
 928   size_t linkoutsize __attribute__ ((unused));
 929
 930   opts[0] = CU_JIT_WALL_TIME;
 931   optvals[0] = &elapsed;
 932
 933   opts[1] = CU_JIT_INFO_LOG_BUFFER;
 934   optvals[1] = &ilog[0];
 935
 936   opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
 937   optvals[2] = (void *) sizeof ilog;
 938
 939   opts[3] = CU_JIT_ERROR_LOG_BUFFER;
 940   optvals[3] = &elog[0];
 941
 942   opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
 943   optvals[4] = (void *) sizeof elog;
 944
 945   opts[5] = CU_JIT_LOG_VERBOSE;
 946   optvals[5] = (void *) 1;
 947
 948   static intptr_t gomp_nvptx_o = -1;
 949
 950   static bool init_done = false;
 951   if (!init_done)
 952     {
 953       process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
 954       init_done = true;
 955   }
 956
 957   int nopts = 6;
 958   if (gomp_nvptx_o != -1)
 959     {
 960       opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
 961       optvals[nopts] = (void *) gomp_nvptx_o;
 962       nopts++;
 963     }
 964
 965   if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
 966     CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
 967   else
 968     CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
 969
 970   for (; num_objs--; ptx_objs++)
 971     {
 972       /* cuLinkAddData's 'data' argument erroneously omits the const
 973          qualifier.  */
 974       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
 975       if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
 976         r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
 977                                (char *) ptx_objs->code, ptx_objs->size,
 978                                0, 0, 0, 0);
 979       else
 980         r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
 981                                (char *) ptx_objs->code, ptx_objs->size,
 982                                0, 0, 0, 0);
 983       if (r != CUDA_SUCCESS)
 984         {
 985           GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
 986           GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
 987                              cuda_error (r));
 988           return false;
 989         }
 990     }
 991
 992   GOMP_PLUGIN_debug (0, "Linking\n");
 993   r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
 994
 995   GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
 996   GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
 997
 998   if (r != CUDA_SUCCESS)
 999     {
1000       GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
1001       return false;
1002     }
1003
1004   CUDA_CALL (cuModuleLoadData, module, linkout);
1005   CUDA_CALL (cuLinkDestroy, linkstate);
1006   return true;
1007 }
1008
1009 static void
1010 event_gc (bool memmap_lockable)
1011 {
1012   struct ptx_event *ptx_event = ptx_events;
1013   struct ptx_event *async_cleanups = NULL;
1014   struct nvptx_thread *nvthd = nvptx_thread ();
1015
1016   pthread_mutex_lock (&ptx_event_lock);
1017
1018   while (ptx_event != NULL)
1019     {
1020       CUresult r;
1021       struct ptx_event *e = ptx_event;
1022
1023       ptx_event = ptx_event->next;
1024
1025       if (e->ord != nvthd->ptx_dev->ord)
1026         continue;
1027
1028       r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
1029       if (r == CUDA_SUCCESS)
1030         {
1031           bool append_async = false;
1032           CUevent *te;
1033
1034           te = e->evt;
1035
1036           switch (e->type)
1037             {
1038             case PTX_EVT_MEM:
1039             case PTX_EVT_SYNC:
1040               break;
1041
1042             case PTX_EVT_KNL:
1043               map_pop (e->addr);
1044               break;
1045
1046             case PTX_EVT_ASYNC_CLEANUP:
1047               {
1048                 /* The function gomp_plugin_async_unmap_vars needs to claim the
1049                    memory-map splay tree lock for the current device, so we
1050                    can't call it when one of our callers has already claimed
1051                    the lock.  In that case, just delay the GC for this event
1052                    until later.  */
1053                 if (!memmap_lockable)
1054                   continue;
1055
1056                 append_async = true;
1057               }
1058               break;
1059             }
1060
1061           CUDA_CALL_NOCHECK (cuEventDestroy, *te);
1062           free ((void *)te);
1063
1064           /* Unlink 'e' from ptx_events list.  */
1065           if (ptx_events == e)
1066             ptx_events = ptx_events->next;
1067           else
1068             {
1069               struct ptx_event *e_ = ptx_events;
1070               while (e_->next != e)
1071                 e_ = e_->next;
1072               e_->next = e_->next->next;
1073             }
1074
1075           if (append_async)
1076             {
1077               e->next = async_cleanups;
1078               async_cleanups = e;
1079             }
1080           else
1081             free (e);
1082         }
1083     }
1084
1085   pthread_mutex_unlock (&ptx_event_lock);
1086
1087   /* We have to do these here, after ptx_event_lock is released.  */
1088   while (async_cleanups)
1089     {
1090       struct ptx_event *e = async_cleanups;
1091       async_cleanups = async_cleanups->next;
1092
1093       GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1094       free (e);
1095     }
1096 }
1097
1098 static void
1099 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1100 {
1101   struct ptx_event *ptx_event;
1102   struct nvptx_thread *nvthd = nvptx_thread ();
1103
1104   assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1105           || type == PTX_EVT_ASYNC_CLEANUP);
1106
1107   ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1108   ptx_event->type = type;
1109   ptx_event->evt = e;
1110   ptx_event->addr = h;
1111   ptx_event->ord = nvthd->ptx_dev->ord;
1112   ptx_event->val = val;
1113
1114   pthread_mutex_lock (&ptx_event_lock);
1115
1116   ptx_event->next = ptx_events;
1117   ptx_events = ptx_event;
1118
1119   pthread_mutex_unlock (&ptx_event_lock);
1120 }
1121
/* Launch the offloaded kernel described by FN (a struct
   targ_fn_descriptor) on the stream associated with ASYNC.

   MAPNUM is the number of entries in DEVADDRS.  Those pointers are copied
   into a device buffer whose address is handed to the kernel as its only
   argument.  DIMS holds the launch dimensions indexed by GOMP_DIM_*; it
   is updated in place: entries for which FN's launch descriptor has a
   non-zero value are overwritten with that value, and entries that are
   still zero afterwards are replaced with defaults.  HOSTADDRS and
   TARG_MEM_DESC are not referenced in this function.

   When ASYNC is below acc_async_noval the function waits for the stream
   to finish; otherwise it records a PTX_EVT_KNL event on the stream and
   returns.  Failures are reported through GOMP_PLUGIN_fatal or the
   CUDA_CALL_ASSERT macro.  */

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp;
  CUdeviceptr dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  /* Look up the stream for ASYNC; it is expected to be the stream that is
     already current for this thread.  */
  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      /* A non-zero value in the launch descriptor takes precedence over
         the caller-provided one.  */
      if (targ_fn->launch->dim[i])
       dims[i] = targ_fn->launch->dim[i];
      /* Remember whether any dimension still needs a default.  */
      if (!dims[i])
       seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      /* Process-wide defaults, filled in lazily under ptx_dev_lock.  A
         zero in element 0 is treated as "not yet initialized".  */
      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
        {
          /* See if the user provided GOMP_OPENACC_DIM environment
             variable to specify runtime defaults.  */
          for (int i = 0; i < GOMP_DIM_MAX; ++i)
            gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
        }

      /* Per-device defaults, likewise computed once; element 0 again
         doubles as the "initialized" marker.  */
      if (!nvthd->ptx_dev->default_dims[0])
        {
          int default_dims[GOMP_DIM_MAX];
          for (int i = 0; i < GOMP_DIM_MAX; ++i)
            default_dims[i] = gomp_openacc_dims[i];

          int gang, worker, vector;
          {
            /* Derive hardware-based suggestions from the device limits
               cached in the ptx_device structure.  */
            int block_size = nvthd->ptx_dev->max_threads_per_block;
            int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
            int dev_size = nvthd->ptx_dev->num_sms;
            GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
                               " dev_size=%d, cpu_size=%d\n",
                               warp_size, block_size, dev_size, cpu_size);

            gang = (cpu_size / block_size) * dev_size;
            worker = block_size / warp_size;
            vector = warp_size;
          }

          /* There is no upper bound on the gang size.  The best size
             matches the hardware configuration.  Logical gangs are
             scheduled onto physical hardware.  To maximize usage, we
             should guess a large number.  */
          if (default_dims[GOMP_DIM_GANG] < 1)
            default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
          /* The worker size must not exceed the hardware.  */
          if (default_dims[GOMP_DIM_WORKER] < 1
              || (default_dims[GOMP_DIM_WORKER] > worker && gang))
            default_dims[GOMP_DIM_WORKER] = worker;
          /* The vector size must exactly match the hardware.  */
          if (default_dims[GOMP_DIM_VECTOR] < 1
              || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
            default_dims[GOMP_DIM_VECTOR] = vector;

          GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
                             default_dims[GOMP_DIM_GANG],
                             default_dims[GOMP_DIM_WORKER],
                             default_dims[GOMP_DIM_VECTOR]);

          for (i = 0; i != GOMP_DIM_MAX; i++)
            nvthd->ptx_dev->default_dims[i] = default_dims[i];
        }
      pthread_mutex_unlock (&ptx_dev_lock);

      {
        /* Record which dimensions were left unspecified before any of
           them is filled in below.  */
        bool default_dim_p[GOMP_DIM_MAX];
        for (i = 0; i != GOMP_DIM_MAX; i++)
          default_dim_p[i] = !dims[i];

        if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
          {
            /* Without the occupancy query, fall back to the per-device
               defaults and clamp them to what this kernel supports.  */
            for (i = 0; i != GOMP_DIM_MAX; i++)
              if (default_dim_p[i])
                dims[i] = nvthd->ptx_dev->default_dims[i];

            if (default_dim_p[GOMP_DIM_VECTOR])
              dims[GOMP_DIM_VECTOR]
                = MIN (dims[GOMP_DIM_VECTOR],
                       (targ_fn->max_threads_per_block / warp_size
                        * warp_size));

            if (default_dim_p[GOMP_DIM_WORKER])
              dims[GOMP_DIM_WORKER]
                = MIN (dims[GOMP_DIM_WORKER],
                       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
          }
        else
          {
            /* Handle the case that the compiler allows the runtime to choose
               the vector-length conservatively, by ignoring
               gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
               it.  */
            int vectors = 0;
            /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
               gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
               exceed targ_fn->max_threads_per_block. */
            int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
            int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
            int grids, blocks;

            /* The last argument is the block size limit; it is zero here
               whenever the worker or vector dimension is still
               unspecified.  */
            CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
                              &blocks, function, NULL, 0,
                              dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
            GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
                               "grid = %d, block = %d\n", grids, blocks);

            /* Keep the num_gangs proportional to the block size.  In
               the case where a block size is limited by shared-memory
               or the register file capacity, the runtime will not
               excessively over assign gangs to the multiprocessor
               units if their state is going to be swapped out even
               more than necessary. The constant factor 2 is there to
               prevent threads from idling when there is insufficient
               work for them.  */
            if (gangs == 0)
              gangs = 2 * grids * (blocks / warp_size);

            if (vectors == 0)
              vectors = warp_size;

            if (workers == 0)
              {
                /* Use the vector length that will really be launched: the
                   default chosen above, or the one specified explicitly.  */
                int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
                                      ? vectors
                                      : dims[GOMP_DIM_VECTOR]);
                workers = blocks / actual_vectors;
              }

            /* Only dimensions that were left unspecified are replaced.  */
            for (i = 0; i != GOMP_DIM_MAX; i++)
              if (default_dim_p[i])
                switch (i)
                  {
                  case GOMP_DIM_GANG: dims[i] = gangs; break;
                  case GOMP_DIM_WORKER: dims[i] = workers; break;
                  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
                  default: GOMP_PLUGIN_fatal ("invalid dim");
                  }
          }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      int suggest_workers
        = targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR];
      GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources to"
                         " launch '%s' with num_workers = %d; recompile the"
                         " program with 'num_workers = %d' on that offloaded"
                         " region or '-fopenacc-dim=:%d'",
                         targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
                         suggest_workers, suggest_workers);
    }

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.
     NOTE(review): in the code as written only DP comes from map_push; HP
     is a separate stack buffer obtained from alloca below -- confirm
     whether the description above is still accurate.  */
  pthread_mutex_lock (&ptx_event_lock);
  dp = map_push (dev_str, mapnum * sizeof (void *));
  pthread_mutex_unlock (&ptx_event_lock);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Stage the array of arguments in a stack-allocated host buffer.  */
  hp = alloca(sizeof(void *) * mapnum);
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device */
  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
                    mapnum * sizeof (void *));
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
                     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC            CUDA
  //
  // num_gangs          nctaid.x
  // num_workers        ntid.y
  // vector length      ntid.x

  /* The kernel receives a single argument: the device address of the
     pointer array copied above.  */
  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,
                    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                    0, dev_str->stream, kargs, 0);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      /* Synchronous case: wait for the stream to drain.  */
      r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      /* Asynchronous case: record an event after the kernel and register
         it as PTX_EVT_KNL with the stream as its 'addr', so that event_gc
         calls map_pop on the stream once the event has completed.  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

      event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
    }
#else
  /* Async support compiled out: always wait for the whole context.  */
  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  /* Note that this message is also emitted on the asynchronous path,
     where the kernel has only been enqueued.  */
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

  /* In the synchronous case the argument buffer can be released right
     away; in the asynchronous case event_gc does it later.  */
#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
1385
1386 void * openacc_get_current_cuda_context (void);
1387
1388 static void *
1389 nvptx_alloc (size_t s)
1390 {
1391   CUdeviceptr d;
1392
1393   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1394   return (void *) d;
1395 }
1396
1397 static bool
1398 nvptx_free (void *p)
1399 {
1400   CUdeviceptr pb;
1401   size_t ps;
1402
1403   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1404   if ((CUdeviceptr) p != pb)
1405     {
1406       GOMP_PLUGIN_error ("invalid device address");
1407       return false;
1408     }
1409
1410   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1411   return true;
1412 }
1413
1414
1415 static bool
1416 nvptx_host2dev (void *d, const void *h, size_t s)
1417 {
1418   CUdeviceptr pb;
1419   size_t ps;
1420   struct nvptx_thread *nvthd = nvptx_thread ();
1421
1422   if (!s)
1423     return true;
1424   if (!d)
1425     {
1426       GOMP_PLUGIN_error ("invalid device address");
1427       return false;
1428     }
1429
1430   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1431
1432   if (!pb)
1433     {
1434       GOMP_PLUGIN_error ("invalid device address");
1435       return false;
1436     }
1437   if (!h)
1438     {
1439       GOMP_PLUGIN_error ("invalid host address");
1440       return false;
1441     }
1442   if (d == h)
1443     {
1444       GOMP_PLUGIN_error ("invalid host or device address");
1445       return false;
1446     }
1447   if ((void *)(d + s) > (void *)(pb + ps))
1448     {
1449       GOMP_PLUGIN_error ("invalid size");
1450       return false;
1451     }
1452
1453 #ifndef DISABLE_ASYNC
1454   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1455     {
1456       CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1457       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1458       event_gc (false);
1459       CUDA_CALL (cuMemcpyHtoDAsync,
1460                  (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1461       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1462       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1463     }
1464   else
1465 #endif
1466     CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1467
1468   return true;
1469 }
1470
1471 static bool
1472 nvptx_dev2host (void *h, const void *d, size_t s)
1473 {
1474   CUdeviceptr pb;
1475   size_t ps;
1476   struct nvptx_thread *nvthd = nvptx_thread ();
1477
1478   if (!s)
1479     return true;
1480   if (!d)
1481     {
1482       GOMP_PLUGIN_error ("invalid device address");
1483       return false;
1484     }
1485
1486   CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1487
1488   if (!pb)
1489     {
1490       GOMP_PLUGIN_error ("invalid device address");
1491       return false;
1492     }
1493   if (!h)
1494     {
1495       GOMP_PLUGIN_error ("invalid host address");
1496       return false;
1497     }
1498   if (d == h)
1499     {
1500       GOMP_PLUGIN_error ("invalid host or device address");
1501       return false;
1502     }
1503   if ((void *)(d + s) > (void *)(pb + ps))
1504     {
1505       GOMP_PLUGIN_error ("invalid size");
1506       return false;
1507     }
1508
1509 #ifndef DISABLE_ASYNC
1510   if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1511     {
1512       CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1513       CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1514       event_gc (false);
1515       CUDA_CALL (cuMemcpyDtoHAsync,
1516                  h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1517       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1518       event_add (PTX_EVT_MEM, e, (void *)h, 0);
1519     }
1520   else
1521 #endif
1522     CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1523
1524   return true;
1525 }
1526
1527 static void
1528 nvptx_set_async (int async)
1529 {
1530   struct nvptx_thread *nvthd = nvptx_thread ();
1531   nvthd->current_stream
1532     = select_stream_for_async (async, pthread_self (), true, NULL);
1533 }
1534
1535 static int
1536 nvptx_async_test (int async)
1537 {
1538   CUresult r;
1539   struct ptx_stream *s;
1540
1541   s = select_stream_for_async (async, pthread_self (), false, NULL);
1542
1543   if (!s)
1544     GOMP_PLUGIN_fatal ("unknown async %d", async);
1545
1546   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1547   if (r == CUDA_SUCCESS)
1548     {
1549       /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1550          whether all work has completed on this stream, and if so omits the call
1551          to the wait hook.  If that happens, event_gc might not get called
1552          (which prevents variables from getting unmapped and their associated
1553          device storage freed), so call it here.  */
1554       event_gc (true);
1555       return 1;
1556     }
1557   else if (r == CUDA_ERROR_NOT_READY)
1558     return 0;
1559
1560   GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1561
1562   return 0;
1563 }
1564
1565 static int
1566 nvptx_async_test_all (void)
1567 {
1568   struct ptx_stream *s;
1569   pthread_t self = pthread_self ();
1570   struct nvptx_thread *nvthd = nvptx_thread ();
1571
1572   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1573
1574   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1575     {
1576       if ((s->multithreaded || pthread_equal (s->host_thread, self))
1577           && CUDA_CALL_NOCHECK (cuStreamQuery,
1578                                 s->stream) == CUDA_ERROR_NOT_READY)
1579         {
1580           pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1581           return 0;
1582         }
1583     }
1584
1585   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1586
1587   event_gc (true);
1588
1589   return 1;
1590 }
1591
1592 static void
1593 nvptx_wait (int async)
1594 {
1595   struct ptx_stream *s;
1596
1597   s = select_stream_for_async (async, pthread_self (), false, NULL);
1598   if (!s)
1599     GOMP_PLUGIN_fatal ("unknown async %d", async);
1600
1601   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1602
1603   event_gc (true);
1604 }
1605
1606 static void
1607 nvptx_wait_async (int async1, int async2)
1608 {
1609   CUevent *e;
1610   struct ptx_stream *s1, *s2;
1611   pthread_t self = pthread_self ();
1612
1613   /* The stream that is waiting (rather than being waited for) doesn't
1614      necessarily have to exist already.  */
1615   s2 = select_stream_for_async (async2, self, true, NULL);
1616
1617   s1 = select_stream_for_async (async1, self, false, NULL);
1618   if (!s1)
1619     GOMP_PLUGIN_fatal ("invalid async 1\n");
1620
1621   if (s1 == s2)
1622     GOMP_PLUGIN_fatal ("identical parameters");
1623
1624   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1625
1626   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1627
1628   event_gc (true);
1629
1630   CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1631
1632   event_add (PTX_EVT_SYNC, e, NULL, 0);
1633
1634   CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1635 }
1636
1637 static void
1638 nvptx_wait_all (void)
1639 {
1640   CUresult r;
1641   struct ptx_stream *s;
1642   pthread_t self = pthread_self ();
1643   struct nvptx_thread *nvthd = nvptx_thread ();
1644
1645   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1646
1647   /* Wait for active streams initiated by this thread (or by multiple threads)
1648      to complete.  */
1649   for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1650     {
1651       if (s->multithreaded || pthread_equal (s->host_thread, self))
1652         {
1653           r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1654           if (r == CUDA_SUCCESS)
1655             continue;
1656           else if (r != CUDA_ERROR_NOT_READY)
1657             GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1658
1659           CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1660         }
1661     }
1662
1663   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1664
1665   event_gc (true);
1666 }
1667
1668 static void
1669 nvptx_wait_all_async (int async)
1670 {
1671   struct ptx_stream *waiting_stream, *other_stream;
1672   CUevent *e;
1673   struct nvptx_thread *nvthd = nvptx_thread ();
1674   pthread_t self = pthread_self ();
1675
1676   /* The stream doing the waiting.  This could be the first mention of the
1677      stream, so create it if necessary.  */
1678   waiting_stream
1679     = select_stream_for_async (async, pthread_self (), true, NULL);
1680
1681   /* Launches on the null stream already block on other streams in the
1682      context.  */
1683   if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1684     return;
1685
1686   event_gc (true);
1687
1688   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1689
1690   for (other_stream = nvthd->ptx_dev->active_streams;
1691        other_stream != NULL;
1692        other_stream = other_stream->next)
1693     {
1694       if (!other_stream->multithreaded
1695           && !pthread_equal (other_stream->host_thread, self))
1696         continue;
1697
1698       e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1699
1700       CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1701
1702       /* Record an event on the waited-for stream.  */
1703       CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1704
1705       event_add (PTX_EVT_SYNC, e, NULL, 0);
1706
1707       CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1708    }
1709
1710   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1711 }
1712
1713 static void *
1714 nvptx_get_current_cuda_device (void)
1715 {
1716   struct nvptx_thread *nvthd = nvptx_thread ();
1717
1718   if (!nvthd || !nvthd->ptx_dev)
1719     return NULL;
1720
1721   return &nvthd->ptx_dev->dev;
1722 }
1723
1724 static void *
1725 nvptx_get_current_cuda_context (void)
1726 {
1727   struct nvptx_thread *nvthd = nvptx_thread ();
1728
1729   if (!nvthd || !nvthd->ptx_dev)
1730     return NULL;
1731
1732   return nvthd->ptx_dev->ctx;
1733 }
1734
1735 static void *
1736 nvptx_get_cuda_stream (int async)
1737 {
1738   struct ptx_stream *s;
1739   struct nvptx_thread *nvthd = nvptx_thread ();
1740
1741   if (!nvthd || !nvthd->ptx_dev)
1742     return NULL;
1743
1744   s = select_stream_for_async (async, pthread_self (), false, NULL);
1745
1746   return s ? s->stream : NULL;
1747 }
1748
1749 static int
1750 nvptx_set_cuda_stream (int async, void *stream)
1751 {
1752   struct ptx_stream *oldstream;
1753   pthread_t self = pthread_self ();
1754   struct nvptx_thread *nvthd = nvptx_thread ();
1755
1756   if (async < 0)
1757     GOMP_PLUGIN_fatal ("bad async %d", async);
1758
1759   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1760
1761   /* We have a list of active streams and an array mapping async values to
1762      entries of that list.  We need to take "ownership" of the passed-in stream,
1763      and add it to our list, removing the previous entry also (if there was one)
1764      in order to prevent resource leaks.  Note the potential for surprise
1765      here: maybe we should keep track of passed-in streams and leave it up to
1766      the user to tidy those up, but that doesn't work for stream handles
1767      returned from acc_get_cuda_stream above...  */
1768
1769   oldstream = select_stream_for_async (async, self, false, NULL);
1770
1771   if (oldstream)
1772     {
1773       if (nvthd->ptx_dev->active_streams == oldstream)
1774         nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1775       else
1776         {
1777           struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1778           while (s->next != oldstream)
1779             s = s->next;
1780           s->next = s->next->next;
1781         }
1782
1783       CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1784
1785       if (!map_fini (oldstream))
1786         GOMP_PLUGIN_fatal ("error when freeing host memory");
1787
1788       free (oldstream);
1789     }
1790
1791   pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1792
1793   (void) select_stream_for_async (async, self, true, (CUstream) stream);
1794
1795   return 1;
1796 }
1797
1798 /* Plugin entry points.  */
1799
/* Return the name of this plugin, the constant string "nvptx".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char plugin_name[] = "nvptx";

  return plugin_name;
}
1805
1806 unsigned int
1807 GOMP_OFFLOAD_get_caps (void)
1808 {
1809   return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1810 }
1811
1812 int
1813 GOMP_OFFLOAD_get_type (void)
1814 {
1815   return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1816 }
1817
/* Return the device count reported by nvptx_get_num_devices.  */

int
GOMP_OFFLOAD_get_num_devices (void)
{
  int count = nvptx_get_num_devices ();

  return count;
}
1823
1824 bool
1825 GOMP_OFFLOAD_init_device (int n)
1826 {
1827   struct ptx_device *dev;
1828
1829   pthread_mutex_lock (&ptx_dev_lock);
1830
1831   if (!nvptx_init () || ptx_devices[n] != NULL)
1832     {
1833       pthread_mutex_unlock (&ptx_dev_lock);
1834       return false;
1835     }
1836
1837   dev = nvptx_open_device (n);
1838   if (dev)
1839     {
1840       ptx_devices[n] = dev;
1841       instantiated_devices++;
1842     }
1843
1844   pthread_mutex_unlock (&ptx_dev_lock);
1845
1846   return dev != NULL;
1847 }
1848
1849 bool
1850 GOMP_OFFLOAD_fini_device (int n)
1851 {
1852   pthread_mutex_lock (&ptx_dev_lock);
1853
1854   if (ptx_devices[n] != NULL)
1855     {
1856       if (!nvptx_attach_host_thread_to_device (n)
1857           || !nvptx_close_device (ptx_devices[n]))
1858         {
1859           pthread_mutex_unlock (&ptx_dev_lock);
1860           return false;
1861         }
1862       ptx_devices[n] = NULL;
1863       instantiated_devices--;
1864     }
1865
1866   pthread_mutex_unlock (&ptx_dev_lock);
1867   return true;
1868 }
1869
1870 /* Return the libgomp version number we're compatible with.  There is
1871    no requirement for cross-version compatibility.  */
1872
1873 unsigned
1874 GOMP_OFFLOAD_version (void)
1875 {
1876   return GOMP_VERSION;
1877 }
1878
1879 /* Initialize __nvptx_clocktick, if present in MODULE.  */
1880
1881 static void
1882 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1883 {
1884   CUdeviceptr dptr;
1885   CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1886                                   module, "__nvptx_clocktick");
1887   if (r == CUDA_ERROR_NOT_FOUND)
1888     return;
1889   if (r != CUDA_SUCCESS)
1890     GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1891   double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1892   r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1893                          sizeof (__nvptx_clocktick));
1894   if (r != CUDA_SUCCESS)
1895     GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1896 }
1897
1898 /* Load the (partial) program described by TARGET_DATA to device
1899    number ORD.  Allocate and return TARGET_TABLE.  */
1900
1901 int
1902 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1903                          struct addr_pair **target_table)
1904 {
1905   CUmodule module;
1906   const char *const *var_names;
1907   const struct targ_fn_launch *fn_descs;
1908   unsigned int fn_entries, var_entries, i, j;
1909   struct targ_fn_descriptor *targ_fns;
1910   struct addr_pair *targ_tbl;
1911   const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1912   struct ptx_image_data *new_image;
1913   struct ptx_device *dev;
1914
1915   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1916     {
1917       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1918                          " (expected %u, received %u)",
1919                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1920       return -1;
1921     }
1922
1923   if (!nvptx_attach_host_thread_to_device (ord)
1924       || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1925     return -1;
1926
1927   dev = ptx_devices[ord];
1928
1929   /* The mkoffload utility emits a struct of pointers/integers at the
1930      start of each offload image.  The array of kernel names and the
1931      functions addresses form a one-to-one correspondence.  */
1932
1933   var_entries = img_header->var_num;
1934   var_names = img_header->var_names;
1935   fn_entries = img_header->fn_num;
1936   fn_descs = img_header->fn_descs;
1937
1938   targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1939                                  * (fn_entries + var_entries));
1940   targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1941                                  * fn_entries);
1942
1943   *target_table = targ_tbl;
1944
1945   new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1946   new_image->target_data = target_data;
1947   new_image->module = module;
1948   new_image->fns = targ_fns;
1949
1950   pthread_mutex_lock (&dev->image_lock);
1951   new_image->next = dev->images;
1952   dev->images = new_image;
1953   pthread_mutex_unlock (&dev->image_lock);
1954
1955   for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1956     {
1957       CUfunction function;
1958       int nregs, mthrs;
1959
1960       CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1961                       fn_descs[i].fn);
1962       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1963                       CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1964       CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1965                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1966
1967       targ_fns->fn = function;
1968       targ_fns->launch = &fn_descs[i];
1969       targ_fns->regs_per_thread = nregs;
1970       targ_fns->max_threads_per_block = mthrs;
1971
1972       targ_tbl->start = (uintptr_t) targ_fns;
1973       targ_tbl->end = targ_tbl->start + 1;
1974     }
1975
1976   for (j = 0; j < var_entries; j++, targ_tbl++)
1977     {
1978       CUdeviceptr var;
1979       size_t bytes;
1980
1981       CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1982                       &var, &bytes, module, var_names[j]);
1983
1984       targ_tbl->start = (uintptr_t) var;
1985       targ_tbl->end = targ_tbl->start + bytes;
1986     }
1987
1988   nvptx_set_clocktick (module, dev);
1989
1990   return fn_entries + var_entries;
1991 }
1992
1993 /* Unload the program described by TARGET_DATA.  DEV_DATA is the
1994    function descriptors allocated by G_O_load_image.  */
1995
1996 bool
1997 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1998 {
1999   struct ptx_image_data *image, **prev_p;
2000   struct ptx_device *dev = ptx_devices[ord];
2001
2002   if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
2003     {
2004       GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
2005                          " (expected %u, received %u)",
2006                          GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
2007       return false;
2008     }
2009
2010   bool ret = true;
2011   pthread_mutex_lock (&dev->image_lock);
2012   for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
2013     if (image->target_data == target_data)
2014       {
2015         *prev_p = image->next;
2016         if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
2017           ret = false;
2018         free (image->fns);
2019         free (image);
2020         break;
2021       }
2022   pthread_mutex_unlock (&dev->image_lock);
2023   return ret;
2024 }
2025
/* Attach the calling host thread to device ORD and allocate SIZE bytes
   through nvptx_alloc.  Return NULL if the attach step fails, otherwise
   whatever nvptx_alloc returns.  */

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  return nvptx_attach_host_thread_to_device (ord) ? nvptx_alloc (size) : NULL;
}
2033
/* Attach the calling host thread to device ORD and release PTR through
   nvptx_free.  Return false if either step reports failure.  */

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_free (ptr);
}
2040
/* Attach the calling host thread to device ORD, then copy N bytes from
   SRC to DST via nvptx_dev2host.  Return false if either step reports
   failure.  */

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_dev2host (dst, src, n);
}
2047
/* Attach the calling host thread to device ORD, then copy N bytes from
   SRC to DST via nvptx_host2dev.  Return false if either step reports
   failure.  */

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return false;

  return nvptx_host2dev (dst, src, n);
}
2054
2055 bool
2056 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2057 {
2058   struct ptx_device *ptx_dev = ptx_devices[ord];
2059   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
2060                                 ptx_dev->null_stream->stream);
2061   return true;
2062 }
2063
/* Externally visible function pointer taking (N, FN_PTR, VARS); it starts
   out NULL.  NOTE(review): no assignment or call is visible near this
   definition -- confirm whether anything still uses it.  */
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
2065
/* OpenACC plugin hook: pass every argument through, unchanged and in
   order, to nvptx_exec.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs,
                           int async, unsigned *dims, void *targ_mem_desc)
{
  nvptx_exec (fn, mapnum, hostaddrs, devaddrs,
              async, dims, targ_mem_desc);
}
2073
2074 void
2075 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
2076 {
2077   struct nvptx_thread *nvthd = nvptx_thread ();
2078   CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
2079
2080   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
2081   CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
2082   event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
2083 }
2084
/* OpenACC plugin hook: return the result of nvptx_async_test for
   ASYNC.  */

int
GOMP_OFFLOAD_openacc_async_test (int async)
{
  const int result = nvptx_async_test (async);

  return result;
}
2090
/* OpenACC plugin hook: return the result of nvptx_async_test_all.  */

int
GOMP_OFFLOAD_openacc_async_test_all (void)
{
  const int result = nvptx_async_test_all ();

  return result;
}
2096
/* OpenACC plugin hook: hand ASYNC straight to nvptx_wait.  */

void
GOMP_OFFLOAD_openacc_async_wait (int async)
{
  nvptx_wait (async);
  return;
}
2102
/* OpenACC plugin hook: hand ASYNC1 and ASYNC2, in that order, to
   nvptx_wait_async.  */

void
GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
{
  nvptx_wait_async (async1, async2);
  return;
}
2108
/* OpenACC plugin hook: delegate to nvptx_wait_all.  */

void
GOMP_OFFLOAD_openacc_async_wait_all (void)
{
  nvptx_wait_all ();
  return;
}
2114
/* OpenACC plugin hook: hand ASYNC straight to nvptx_wait_all_async.  */

void
GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
{
  nvptx_wait_all_async (async);
  return;
}
2120
/* OpenACC plugin hook: hand ASYNC straight to nvptx_set_async.  */

void
GOMP_OFFLOAD_openacc_async_set_async (int async)
{
  nvptx_set_async (async);
  return;
}
2126
2127 void *
2128 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2129 {
2130   struct ptx_device *ptx_dev;
2131   struct nvptx_thread *nvthd
2132     = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2133   CUcontext thd_ctx;
2134
2135   ptx_dev = ptx_devices[ord];
2136
2137   assert (ptx_dev);
2138
2139   CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2140
2141   assert (ptx_dev->ctx);
2142
2143   if (!thd_ctx)
2144     CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2145
2146   nvthd->current_stream = ptx_dev->null_stream;
2147   nvthd->ptx_dev = ptx_dev;
2148
2149   return (void *) nvthd;
2150 }
2151
/* OpenACC plugin hook: release the per-thread plugin data DATA with
   free.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
  return;
}
2157
/* OpenACC plugin hook: return whatever nvptx_get_current_cuda_device
   returns.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  void *device = nvptx_get_current_cuda_device ();

  return device;
}
2163
/* OpenACC plugin hook: return whatever nvptx_get_current_cuda_context
   returns.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  void *context = nvptx_get_current_cuda_context ();

  return context;
}
2169
/* OpenACC plugin hook: return the result of nvptx_get_cuda_stream for
   ASYNC.  NOTE: the value returned is a CUstream, not a ptx_stream
   pointer.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
{
  void *cuda_stream = nvptx_get_cuda_stream (async);

  return cuda_stream;
}
2177
/* OpenACC plugin hook: pass ASYNC and STREAM to nvptx_set_cuda_stream and
   return its result.  NOTE: STREAM is a CUstream, not a ptx_stream
   pointer.  */

int
GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
{
  const int result = nvptx_set_cuda_stream (async, stream);

  return result;
}
2185
2186 /* Adjust launch dimensions: pick good values for number of blocks and warps
2187    and ensure that number of warps does not exceed CUDA limits as well as GCC's
2188    own limits.  */
2189
2190 static void
2191 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2192                             struct ptx_device *ptx_dev,
2193                             int *teams_p, int *threads_p)
2194 {
2195   int max_warps_block = fn->max_threads_per_block / 32;
2196   /* Maximum 32 warps per block is an implementation limit in NVPTX backend
2197      and libgcc, which matches documented limit of all GPUs as of 2015.  */
2198   if (max_warps_block > 32)
2199     max_warps_block = 32;
2200   if (*threads_p <= 0)
2201     *threads_p = 8;
2202   if (*threads_p > max_warps_block)
2203     *threads_p = max_warps_block;
2204
2205   int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2206   /* This is an estimate of how many blocks the device can host simultaneously.
2207      Actual limit, which may be lower, can be queried with "occupancy control"
2208      driver interface (since CUDA 6.0).  */
2209   int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2210   if (*teams_p <= 0 || *teams_p > max_blocks)
2211     *teams_p = max_blocks;
2212 }
2213
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  The value is a fixed 128 KiB.  Declared with a proper
   (void) prototype so that calls passing arguments are diagnosed.  */

static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}
2222
2223 /* Return contiguous storage for NUM stacks, each SIZE bytes.  */
2224
2225 static void *
2226 nvptx_stacks_alloc (size_t size, int num)
2227 {
2228   CUdeviceptr stacks;
2229   CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2230   if (r != CUDA_SUCCESS)
2231     GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2232   return (void *) stacks;
2233 }
2234
2235 /* Release storage previously allocated by nvptx_stacks_alloc.  */
2236
2237 static void
2238 nvptx_stacks_free (void *p, int num)
2239 {
2240   CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2241   if (r != CUDA_SUCCESS)
2242     GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2243 }
2244
2245 void
2246 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2247 {
2248   CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2249   CUresult r;
2250   struct ptx_device *ptx_dev = ptx_devices[ord];
2251   const char *maybe_abort_msg = "(perhaps abort was called)";
2252   int teams = 0, threads = 0;
2253
2254   if (!args)
2255     GOMP_PLUGIN_fatal ("No target arguments provided");
2256   while (*args)
2257     {
2258       intptr_t id = (intptr_t) *args++, val;
2259       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2260         val = (intptr_t) *args++;
2261       else
2262         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2263       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2264         continue;
2265       val = val > INT_MAX ? INT_MAX : val;
2266       id &= GOMP_TARGET_ARG_ID_MASK;
2267       if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2268         teams = val;
2269       else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2270         threads = val;
2271     }
2272   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2273
2274   size_t stack_size = nvptx_stacks_size ();
2275   void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2276   void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2277   size_t fn_args_size = sizeof fn_args;
2278   void *config[] = {
2279     CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2280     CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2281     CU_LAUNCH_PARAM_END
2282   };
2283   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2284                          32, threads, 1, 0, ptx_dev->null_stream->stream,
2285                          NULL, config);
2286   if (r != CUDA_SUCCESS)
2287     GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2288
2289   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2290   if (r == CUDA_ERROR_LAUNCH_FAILED)
2291     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2292                        maybe_abort_msg);
2293   else if (r != CUDA_SUCCESS)
2294     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2295   nvptx_stacks_free (stacks, teams * threads);
2296 }
2297
/* Asynchronous counterpart of GOMP_OFFLOAD_run.  Not implemented: every
   call ends in GOMP_PLUGIN_fatal, and none of the arguments is used.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
                        void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
  return;
}